Skip to content

Commit 109f7e2

Browse files
authored
Add sagemaker integration tests (#13)
* Fix single host training test.
* Update test configuration.
* Add integration tests: initial commit.
* Move common logic into one place.
* Rename test estimator file.
* Add non-distributed trained GPU model.
* Add TODO about moving timeout to testing utils in container support or SDK.
* Do not specify entrypoint in local tests.
1 parent 8e176d5 commit 109f7e2

File tree

13 files changed

+304
-90
lines changed

13 files changed

+304
-90
lines changed

lib/sagemaker-container-support

test/conftest.py

Lines changed: 61 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13+
import boto3
1314
import os
1415
import logging
1516
import platform
@@ -18,7 +19,7 @@
1819
import sys
1920
import tempfile
2021

21-
22+
from sagemaker import Session
2223
from test.utils import local_mode
2324

2425
logger = logging.getLogger(__name__)
@@ -36,6 +37,8 @@
3637
def pytest_addoption(parser):
3738
parser.addoption('--build-image', '-D', action="store_true")
3839
parser.addoption('--build-base-image', '-B', action="store_true")
40+
parser.addoption('--aws-id')
41+
parser.addoption('--instance-type')
3942
parser.addoption('--install-container-support', '-C', action="store_true")
4043
parser.addoption('--docker-base-name', default='pytorch')
4144
parser.addoption('--region', default='us-west-2')
@@ -46,40 +49,40 @@ def pytest_addoption(parser):
4649
parser.addoption('--tag', default=None)
4750

4851

49-
@pytest.fixture(scope='session')
50-
def docker_base_name(request):
52+
@pytest.fixture(scope='session', name='docker_base_name')
53+
def fixture_docker_base_name(request):
5154
return request.config.getoption('--docker-base-name')
5255

5356

54-
@pytest.fixture(scope='session')
55-
def region(request):
57+
@pytest.fixture(scope='session', name='region')
58+
def fixture_region(request):
5659
return request.config.getoption('--region')
5760

5861

59-
@pytest.fixture(scope='session')
60-
def framework_version(request):
62+
@pytest.fixture(scope='session', name='framework_version')
63+
def fixture_framework_version(request):
6164
return request.config.getoption('--framework-version')
6265

6366

64-
@pytest.fixture(scope='session')
65-
def py_version(request):
67+
@pytest.fixture(scope='session', name='py_version')
68+
def fixture_py_version(request):
6669
return 'py{}'.format(int(request.config.getoption('--py-version')))
6770

6871

69-
@pytest.fixture(scope='session')
70-
def processor(request):
72+
@pytest.fixture(scope='session', name='processor')
73+
def fixture_processor(request):
7174
return request.config.getoption('--processor')
7275

7376

74-
@pytest.fixture(scope='session')
75-
def tag(request, framework_version, processor, py_version):
77+
@pytest.fixture(scope='session', name='tag')
78+
def fixture_tag(request, framework_version, processor, py_version):
7679
provided_tag = request.config.getoption('--tag')
7780
default_tag = '{}-{}-{}'.format(framework_version, processor, py_version)
7881
return provided_tag if provided_tag else default_tag
7982

8083

81-
@pytest.fixture(scope='session')
82-
def docker_image(docker_base_name, tag):
84+
@pytest.fixture(scope='session', name='docker_image')
85+
def fixture_docker_image(docker_base_name, tag):
8386
return '{}:{}'.format(docker_base_name, tag)
8487

8588

@@ -96,20 +99,20 @@ def opt_ml():
9699
shutil.rmtree(tmp, True)
97100

98101

99-
@pytest.fixture(scope='session')
100-
def use_gpu(processor):
102+
@pytest.fixture(scope='session', name='use_gpu')
103+
def fixture_use_gpu(processor):
101104
return processor == 'gpu'
102105

103106

104-
@pytest.fixture(scope='session', autouse=True)
105-
def install_container_support(request):
107+
@pytest.fixture(scope='session', name='install_container_support', autouse=True)
108+
def fixture_install_container_support(request):
106109
install = request.config.getoption('--install-container-support')
107110
if install:
108111
local_mode.install_container_support()
109112

110113

111-
@pytest.fixture(scope='session', autouse=True)
112-
def build_base_image(request, framework_version, py_version, processor, tag, docker_base_name):
114+
@pytest.fixture(scope='session', name='build_base_image', autouse=True)
115+
def fixture_build_base_image(request, framework_version, py_version, processor, tag, docker_base_name):
113116
build_base_image = request.config.getoption('--build-base-image')
114117
if build_base_image:
115118
return local_mode.build_base_image(framework_name=docker_base_name,
@@ -122,8 +125,8 @@ def build_base_image(request, framework_version, py_version, processor, tag, doc
122125
return tag
123126

124127

125-
@pytest.fixture(scope='session', autouse=True)
126-
def build_image(request, framework_version, py_version, processor, tag, docker_base_name):
128+
@pytest.fixture(scope='session', name='build_image', autouse=True)
129+
def fixture_build_image(request, framework_version, py_version, processor, tag, docker_base_name):
127130
build_image = request.config.getoption('--build-image')
128131
if build_image:
129132
return local_mode.build_image(framework_name=docker_base_name,
@@ -134,3 +137,38 @@ def build_image(request, framework_version, py_version, processor, tag, docker_b
134137
cwd=os.path.join(dir_path, '..'))
135138

136139
return tag
140+
141+
142+
@pytest.fixture(scope='session', name='sagemaker_session')
143+
def fixture_sagemaker_session(region):
144+
return Session(boto_session=boto3.Session(region_name=region))
145+
146+
147+
@pytest.fixture(name='aws_id', scope='session')
148+
def fixture_aws_id(request):
149+
return request.config.getoption('--aws-id')
150+
151+
152+
@pytest.fixture(name='instance_type', scope='session')
153+
def fixture_instance_type(request):
154+
return request.config.getoption('--instance-type')
155+
156+
157+
@pytest.fixture(name='docker_registry', scope='session')
158+
def fixture_docker_registry(aws_id, region):
159+
return '{}.dkr.ecr.{}.amazonaws.com'.format(aws_id, region)
160+
161+
162+
@pytest.fixture(name='ecr_image', scope='session')
163+
def fixture_ecr_image(docker_registry, docker_base_name, tag):
164+
return '{}/{}:{}'.format(docker_registry, docker_base_name, tag)
165+
166+
167+
@pytest.fixture(scope='session', name='dist_cpu_backend', params=['tcp', 'gloo'])
168+
def fixture_dist_cpu_backend(request):
169+
return request.param
170+
171+
172+
@pytest.fixture(scope='session', name='dist_gpu_backend', params=['gloo'])
173+
def fixture_dist_gpu_backend(request):
174+
return request.param

test/integration/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,17 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13+
import os
14+
15+
resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'resources'))
16+
mnist_path = os.path.join(resources_path, 'mnist')
17+
mnist_script = os.path.join(mnist_path, 'mnist.py')
18+
data_dir = os.path.join(mnist_path, 'data')
19+
training_dir = os.path.join(data_dir, 'training')
20+
dist_operations_path = os.path.join(resources_path, 'distributed_operations.py')
21+
22+
mnist_1d_script = os.path.join(mnist_path, 'mnist_1d.py')
23+
model_cpu_dir = os.path.join(mnist_path, 'model_cpu')
24+
model_cpu_1d_dir = os.path.join(model_cpu_dir, '1d')
25+
model_gpu_dir = os.path.join(mnist_path, 'model_gpu')
26+
model_gpu_1d_dir = os.path.join(model_gpu_dir, '1d')

test/integration/local/test_distributed_training.py

Lines changed: 5 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -10,34 +10,14 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13-
import os
1413
import pytest
1514
import torch
15+
from test.integration import data_dir, dist_operations_path, mnist_script
1616
from test.utils import local_mode
1717

18-
resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'resources'))
19-
mnist_path = os.path.join(resources_path, 'mnist')
20-
mnist_script = os.path.join(mnist_path, 'mnist.py')
21-
data_dir = os.path.join(mnist_path, 'data')
22-
training_dir = os.path.join(data_dir, 'training')
23-
dist_operations_path = os.path.join(resources_path, 'distributed_operations.py')
24-
25-
ENTRYPOINT = ["python", "-m", "pytorch_container.start"]
26-
27-
28-
@pytest.fixture(scope='session', name='dist_cpu_backend', params=['tcp', 'gloo'])
29-
def fixture_dist_cpu_backend(request):
30-
return request.param
31-
32-
33-
@pytest.fixture(scope='session', name='dist_gpu_backend', params=['gloo'])
34-
def fixture_dist_gpu_backend(request):
35-
return request.param
36-
3718

3819
def test_dist_operations_path_cpu(docker_image, opt_ml, dist_cpu_backend):
39-
local_mode.train(dist_operations_path, data_dir, docker_image, opt_ml,
40-
entrypoint=ENTRYPOINT, cluster_size=3,
20+
local_mode.train(dist_operations_path, data_dir, docker_image, opt_ml, cluster_size=3,
4121
hyperparameters={'backend': dist_cpu_backend})
4222

4323
assert local_mode.file_exists(opt_ml, 'model/success'), 'Script success file was not created'
@@ -47,8 +27,7 @@ def test_dist_operations_path_cpu(docker_image, opt_ml, dist_cpu_backend):
4727

4828
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda is not available")
4929
def test_dist_operations_path_gpu(docker_image, opt_ml, dist_gpu_backend):
50-
local_mode.train(dist_operations_path, data_dir, docker_image, opt_ml,
51-
entrypoint=ENTRYPOINT, cluster_size=3,
30+
local_mode.train(dist_operations_path, data_dir, docker_image, opt_ml, cluster_size=3,
5231
use_gpu=True, hyperparameters={'backend': dist_gpu_backend})
5332

5433
assert local_mode.file_exists(opt_ml, 'model/success'), 'Script success file was not created'
@@ -57,8 +36,7 @@ def test_dist_operations_path_gpu(docker_image, opt_ml, dist_gpu_backend):
5736

5837

5938
def test_mnist_cpu(docker_image, opt_ml, dist_cpu_backend):
60-
local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
61-
entrypoint=ENTRYPOINT, cluster_size=2,
39+
local_mode.train(mnist_script, data_dir, docker_image, opt_ml, cluster_size=2,
6240
hyperparameters={'backend': dist_cpu_backend})
6341

6442
assert local_mode.file_exists(opt_ml, 'model/model'), 'Model file was not created'
@@ -68,8 +46,7 @@ def test_mnist_cpu(docker_image, opt_ml, dist_cpu_backend):
6846

6947
@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda is not available")
7048
def test_mnist_gpu(docker_image, opt_ml, dist_gpu_backend):
71-
local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
72-
entrypoint=ENTRYPOINT, cluster_size=2,
49+
local_mode.train(mnist_script, data_dir, docker_image, opt_ml, cluster_size=2,
7350
use_gpu=True, hyperparameters={'backend': dist_gpu_backend})
7451

7552
assert local_mode.file_exists(opt_ml, 'model/model'), 'Model file was not created'

test/integration/local/test_serving.py

Lines changed: 5 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,50 +10,34 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13-
import os
1413
import json
1514
from six import StringIO, BytesIO
1615
import pytest
1716
import requests
1817
from test.utils import local_mode
1918
import torch
20-
import logging
2119
import torch.utils.data
2220
import torch.utils.data.distributed
2321
from torchvision import datasets, transforms
2422
import numpy as np
2523
from container_support.serving import JSON_CONTENT_TYPE, CSV_CONTENT_TYPE, NPY_CONTENT_TYPE
26-
27-
logger = logging.getLogger(__name__)
28-
logger.setLevel(logging.DEBUG)
29-
30-
mnist_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'resources', 'mnist')
31-
mnist_script = os.path.join(mnist_path, 'mnist.py')
32-
mnist_1d_script = os.path.join(mnist_path, 'mnist_1d.py')
33-
model_cpu_dir = os.path.join(mnist_path, 'model_cpu')
34-
model_cpu_1d_dir = os.path.join(model_cpu_dir, '1d')
35-
model_gpu_dir = os.path.join(mnist_path, 'model_gpu')
36-
model_gpu_1d_dir = os.path.join(model_gpu_dir, '1d')
37-
38-
data_dir = os.path.join(mnist_path, 'data')
39-
training_dir = os.path.join(data_dir, 'training')
40-
41-
ENTRYPOINT = ["python", "-m", "pytorch_container.start"]
24+
from test.integration import training_dir, mnist_script, mnist_1d_script, model_cpu_dir, model_gpu_dir, \
25+
model_cpu_1d_dir
4226

4327

4428
@pytest.fixture(name='serve_cpu')
4529
def fixture_serve_cpu(docker_image, opt_ml):
4630
def serve(model_dir=model_cpu_dir, script=mnist_script):
4731
return local_mode.serve(customer_script=script, model_dir=model_dir, image_name=docker_image,
48-
opt_ml=opt_ml, entrypoint=ENTRYPOINT)
32+
opt_ml=opt_ml)
4933
return serve
5034

5135

5236
@pytest.fixture(name='serve_gpu')
5337
def fixture_serve_gpu(docker_image, opt_ml):
54-
def serve(model_dir=model_cpu_dir, script=mnist_script):
38+
def serve(model_dir=model_gpu_dir, script=mnist_script):
5539
return local_mode.serve(customer_script=script, model_dir=model_dir, image_name=docker_image,
56-
use_gpu=True, opt_ml=opt_ml, entrypoint=ENTRYPOINT)
40+
use_gpu=True, opt_ml=opt_ml)
5741
return serve
5842

5943

@@ -119,7 +103,6 @@ def _assert_prediction_csv(test_loader, accept):
119103

120104

121105
def _get_test_data_loader(batch_size):
122-
logger.info('training dir: {}'.format(os.listdir(training_dir)))
123106
return torch.utils.data.DataLoader(
124107
datasets.MNIST(training_dir, train=False, transform=transforms.Compose([
125108
transforms.ToTensor(),

test/integration/local/test_single_machine_training.py

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -10,26 +10,12 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13-
import os
1413
from test.utils import local_mode
15-
import logging
16-
17-
logger = logging.getLogger(__name__)
18-
logger.setLevel(logging.DEBUG)
19-
20-
mnist_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'resources', 'mnist'))
21-
data_dir = os.path.join(mnist_path, 'data')
22-
23-
training_dir = os.path.join(data_dir, 'training')
24-
25-
mnist_script = 'mnist.py'
26-
27-
ENTRYPOINT = ["python", "-m", "pytorch_container.start"]
14+
from test.integration import data_dir, mnist_script
2815

2916

3017
def test_mnist_cpu(docker_image, opt_ml, use_gpu):
31-
local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
32-
source_dir=mnist_path, use_gpu=use_gpu, entrypoint=ENTRYPOINT)
18+
local_mode.train(mnist_script, data_dir, docker_image, opt_ml, use_gpu=use_gpu)
3319

3420
assert local_mode.file_exists(opt_ml, 'model/model'), 'Model file was not created'
3521
assert local_mode.file_exists(opt_ml, 'output/success'), 'Success file was not created'
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
import pytest
14+
15+
16+
@pytest.fixture(autouse=True)
17+
def skip_by_device_type(request, tag):
18+
if (request.node.get_marker('skip_gpu') and 'gpu' in tag) or (request.node.get_marker('skip_cpu') and 'cpu' in tag):
19+
pytest.skip('Skipping because tag is: {}'.format(tag))
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from sagemaker.pytorch.estimator import PyTorch
14+
15+
16+
class PytorchTestEstimator(PyTorch):
17+
def __init__(self, docker_image_uri, **kwargs):
18+
super(PytorchTestEstimator, self).__init__(**kwargs)
19+
self.docker_image_uri = docker_image_uri
20+
21+
def train_image(self):
22+
return self.docker_image_uri
23+
24+
def create_model(self):
25+
model = super(PytorchTestEstimator, self).create_model()
26+
model.image = self.docker_image_uri
27+
return model

0 commit comments

Comments
 (0)