From 0291535f006efa61405cbbcff495e531600c14e0 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 21 Nov 2019 16:06:51 -0800 Subject: [PATCH 01/11] Update version --- smdebug/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/_version.py b/smdebug/_version.py index 793c25142..a3a9bd544 100644 --- a/smdebug/_version.py +++ b/smdebug/_version.py @@ -1 +1 @@ -__version__ = "0.5a" +__version__ = "0.4.8" From 50940b8aedc673f5c7fcfff0f38302fe6388e1fa Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 22 Nov 2019 19:23:01 -0800 Subject: [PATCH 02/11] Bump up version to 0.4.10, skipping 0.4.9 due to confusion of reverting release --- smdebug/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/_version.py b/smdebug/_version.py index a3a9bd544..805e7c470 100644 --- a/smdebug/_version.py +++ b/smdebug/_version.py @@ -1 +1 @@ -__version__ = "0.4.8" +__version__ = "0.4.10" From 4b79a0fc4c46c82e0e7059cffa8733b8e9e52fcb Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Sat, 23 Nov 2019 05:43:08 +0000 Subject: [PATCH 03/11] Get integration tests working again without patch hacks --- .../tensorflow_integration_tests.py | 2 - tests/zero_code_change/tests/__init__.py | 0 .../hooks/test_mirrored_strategy.py | 493 +++++++++++++++ .../tensorflow/keras/test_keras_mirrored.py | 560 ++++++++++++++++++ .../tests/tensorflow/utils.py | 8 + tests/zero_code_change/tests_path.py | 6 - 6 files changed, 1061 insertions(+), 8 deletions(-) create mode 100644 tests/zero_code_change/tests/__init__.py create mode 100644 tests/zero_code_change/tests/tensorflow/hooks/test_mirrored_strategy.py create mode 100644 tests/zero_code_change/tests/tensorflow/keras/test_keras_mirrored.py create mode 100644 tests/zero_code_change/tests/tensorflow/utils.py delete mode 100644 tests/zero_code_change/tests_path.py diff --git a/tests/zero_code_change/tensorflow_integration_tests.py b/tests/zero_code_change/tensorflow_integration_tests.py index 2e8d3f867..a6b2bcbc1 100644 --- a/tests/zero_code_change/tensorflow_integration_tests.py +++ b/tests/zero_code_change/tensorflow_integration_tests.py @@ -14,8 +14,6 @@ We check that certain tensors are saved. Here in the test suite we delete the hook after every script. """ -from tests_path import * # isort:skip - # Standard Library import argparse diff --git a/tests/zero_code_change/tests/__init__.py b/tests/zero_code_change/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/zero_code_change/tests/tensorflow/hooks/test_mirrored_strategy.py b/tests/zero_code_change/tests/tensorflow/hooks/test_mirrored_strategy.py new file mode 100644 index 000000000..2fa105bfb --- /dev/null +++ b/tests/zero_code_change/tests/tensorflow/hooks/test_mirrored_strategy.py @@ -0,0 +1,493 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convolutional Neural Network Estimator for MNIST, built with tf.layers.""" + +# Future +from __future__ import absolute_import, division, print_function + +# Third Party +import numpy as np +import pytest +import tensorflow as tf +from tensorflow.python.client import device_lib +from tests.tensorflow.utils import create_trial_fast_refresh + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.core.modes import ModeKeys +from smdebug.exceptions import TensorUnavailableForStep +from smdebug.tensorflow import get_hook + + +def cnn_model_fn(features, labels, mode): + """Model function for CNN.""" + # Input Layer + # Reshape X to 4-D tensor: [batch_size, width, height, channels] + # MNIST images are 28x28 pixels, and have one color channel + input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) + + # Convolutional Layer #1 + # Computes 32 features using a 5x5 filter with ReLU activation. + # Padding is added to preserve width and height. + # Input Tensor Shape: [batch_size, 28, 28, 1] + # Output Tensor Shape: [batch_size, 28, 28, 32] + conv1 = tf.layers.conv2d( + inputs=input_layer, filters=32, kernel_size=[5, 5], padding="same", activation=tf.nn.relu + ) + + # Pooling Layer #1 + # First max pooling layer with a 2x2 filter and stride of 2 + # Input Tensor Shape: [batch_size, 28, 28, 32] + # Output Tensor Shape: [batch_size, 14, 14, 32] + pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2) + + # Convolutional Layer #2 + # Computes 64 features using a 5x5 filter. + # Padding is added to preserve width and height. + # Input Tensor Shape: [batch_size, 14, 14, 32] + # Output Tensor Shape: [batch_size, 14, 14, 64] + conv2 = tf.layers.conv2d( + inputs=pool1, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu + ) + + # Pooling Layer #2 + # Second max pooling layer with a 2x2 filter and stride of 2 + # Input Tensor Shape: [batch_size, 14, 14, 64] + # Output Tensor Shape: [batch_size, 7, 7, 64] + pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) + + # Flatten tensor into a batch of vectors + # Input Tensor Shape: [batch_size, 7, 7, 64] + # Output Tensor Shape: [batch_size, 7 * 7 * 64] + pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64]) + + # Dense Layer + # Densely connected layer with 1024 neurons + # Input Tensor Shape: [batch_size, 7 * 7 * 64] + # Output Tensor Shape: [batch_size, 1024] + dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu) + + # Add dropout operation; 0.6 probability that element will be kept + dropout = tf.layers.dropout( + inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN + ) + + # Logits layer + # Input Tensor Shape: [batch_size, 1024] + # Output Tensor Shape: [batch_size, 10] + logits = tf.layers.dense(inputs=dropout, units=10) + + predictions = { + # Generate predictions (for PREDICT and EVAL mode) + "classes": tf.argmax(input=logits, axis=1), + # Add `softmax_tensor` to the graph. It is used for PREDICT and by the + # `logging_hook`. + "probabilities": tf.nn.softmax(logits, name="softmax_tensor"), + } + if mode == tf.estimator.ModeKeys.PREDICT: + return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) + + # Calculate Loss (for both TRAIN and EVAL modes) + loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits) + + # Configure the Training Op (for TRAIN mode) + if mode == tf.estimator.ModeKeys.TRAIN: + optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001) + optimizer = smd.get_hook().wrap_optimizer(optimizer) + train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step()) + return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) + + # Add evaluation metrics (for EVAL mode) + eval_metric_ops = { + "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"]) + } + return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) + + +def per_device_batch_size(batch_size, num_gpus): + """For multi-gpu, batch-size must be a multiple of the number of GPUs. + Note that this should eventually be handled by DistributionStrategies + directly. Multi-GPU support is currently experimental, however, + so doing the work here until that feature is in place. + Args: + batch_size: Global batch size to be divided among devices. This should be + equal to num_gpus times the single-GPU batch_size for multi-gpu training. + num_gpus: How many GPUs are used with DistributionStrategies. + Returns: + Batch size per device. + Raises: + ValueError: if batch_size is not divisible by number of devices + """ + if num_gpus <= 1: + return batch_size + + remainder = batch_size % num_gpus + if remainder: + err = ( + "When running with multiple GPUs, batch size " + "must be a multiple of the number of available GPUs. Found {} " + "GPUs with a batch size of {}; try --batch_size={} instead." + ).format(num_gpus, batch_size, batch_size - remainder) + raise ValueError(err) + return int(batch_size / num_gpus) + + +class InputFnProvider: + def __init__(self, train_batch_size): + self.train_batch_size = train_batch_size + self.__load_data() + + def __load_data(self): + # Load training and eval data + mnist = tf.contrib.learn.datasets.load_dataset("mnist") + self.train_data = mnist.train.images # Returns np.array + self.train_labels = np.asarray(mnist.train.labels, dtype=np.int32) + self.eval_data = mnist.test.images # Returns np.array + self.eval_labels = np.asarray(mnist.test.labels, dtype=np.int32) + + def train_input_fn(self): + """An input function for training""" + # Shuffle, repeat, and batch the examples. + dataset = tf.data.Dataset.from_tensor_slices(({"x": self.train_data}, self.train_labels)) + dataset = dataset.shuffle(1000).repeat().batch(self.train_batch_size) + return dataset + + def eval_input_fn(self): + """An input function for evaluation or prediction""" + dataset = tf.data.Dataset.from_tensor_slices(({"x": self.eval_data}, self.eval_labels)) + dataset = dataset.batch(1).repeat() + return dataset + + +def get_available_gpus(): + local_device_protos = device_lib.list_local_devices() + return len([x.name for x in local_device_protos if x.device_type == "GPU"]) + + +def helper_mirrored( + trial_dir, + save_all=False, + num_steps=3, + save_config=None, + reduction_config=None, + include_collections=None, + steps=None, + zcc=False, + eval_distributed=False, + include_workers="all", +): + num_gpus = get_available_gpus() + num_devices = num_gpus if num_gpus > 0 else 1 + batch_size = 10 * num_devices + + # input_fn which serves Dataset + input_fn_provider = InputFnProvider(per_device_batch_size(batch_size, num_devices)) + + # Use multiple GPUs by MirroredStragtegy. + # All avaiable GPUs will be used if `num_gpus` is omitted. + # if num_devices > 1: + distribution = tf.contrib.distribute.MirroredStrategy() + # print("### Doing Multi GPU Training") + # else: + # distribution = None + # Pass to RunConfig + config = tf.estimator.RunConfig( + train_distribute=distribution, + eval_distribute=distribution if eval_distributed else None, + model_dir="/tmp/mnist_convnet_model", + ) + + if save_config is None: + save_config = smd.SaveConfig(save_interval=2) + + if include_collections is None: + include_collections = [ + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.GRADIENTS, + CollectionKeys.LOSSES, + ] + + if not zcc: + ts_hook = smd.SessionHook( + out_dir=trial_dir, + save_all=save_all, + include_collections=include_collections, + save_config=save_config, + reduction_config=reduction_config, + include_workers=include_workers, + ) + else: + print("zcc is passed. ignoring include_collections and save_config") + + mnist_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, config=config) + if steps is None: + steps = ["train"] + + for s in steps: + if s == "train": + print("Starting train") + if not zcc: + ts_hook.set_mode(smd.modes.TRAIN) + # Train the model + mnist_classifier.train( + input_fn=input_fn_provider.train_input_fn, steps=num_steps, hooks=[ts_hook] + ) + else: + mnist_classifier.train(input_fn=input_fn_provider.train_input_fn, steps=num_steps) + elif s == "eval": + print("Starting eval") + + if not zcc: + ts_hook.set_mode(smd.modes.EVAL) + # Evaluate the model and print results + mnist_classifier.evaluate( + input_fn=input_fn_provider.eval_input_fn, steps=num_steps, hooks=[ts_hook] + ) + else: + mnist_classifier.evaluate(input_fn=input_fn_provider.eval_input_fn, steps=num_steps) + elif s == "predict": + print("Starting predict") + if not zcc: + ts_hook.set_mode(smd.modes.PREDICT) + # Evaluate the model and print results + p = mnist_classifier.predict( + input_fn=input_fn_provider.eval_input_fn, hooks=[ts_hook] + ) + else: + p = mnist_classifier.predict(input_fn=input_fn_provider.eval_input_fn) + for i in range(num_steps): + next(p) + get_hook()._cleanup() + return distribution + + +def skip_trial_check(): + # Skip trial check as in this case SMDebug is disabled for mirrored strategy + # trial will not be loaded + import tensorflow as tf + from packaging import version + + if version.parse(tf.__version__) < version.parse("1.14.0"): + return True + else: + return False + + +@pytest.mark.slow +def test_basic(out_dir, zcc=False): + strategy = helper_mirrored( + out_dir, + steps=["train", "eval", "predict", "train"], + include_collections=[ + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.GRADIENTS, + CollectionKeys.LOSSES, + ], + eval_distributed=False, + zcc=zcc, + ) + if skip_trial_check(): + return + + tr = create_trial_fast_refresh(out_dir) + # wts, grads, losses + print(tr.tensors()) + assert len(tr.tensors()) == 8 + 8 + (1 * strategy.num_replicas_in_sync) + 1 + assert len(tr.steps()) == 7 + assert len(tr.steps(ModeKeys.TRAIN)) == 3 + assert len(tr.steps(ModeKeys.EVAL)) == 2 + assert len(tr.steps(ModeKeys.PREDICT)) == 2 + + assert "dense_1/kernel:0" in tr.tensors(collection="weights") + for tname in tr.tensors(collection="weights"): + for s in tr.tensor(tname).steps(ModeKeys.TRAIN): + assert len(tr.tensor(tname).workers(s, ModeKeys.TRAIN)) == strategy.num_replicas_in_sync + for worker in tr.tensor(tname).workers(s, ModeKeys.TRAIN): + assert tr.tensor(tname).value(s, worker=worker, mode=ModeKeys.TRAIN) is not None + for s in tr.tensor(tname).steps(ModeKeys.EVAL): + assert len(tr.tensor(tname).workers(s, ModeKeys.EVAL)) == strategy.num_replicas_in_sync + assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None + + tensornames = tr.tensors(regex="Identity_\d+:0") + for s in tr.tensor(tensornames[0]).steps(ModeKeys.TRAIN): + for w in tr.tensor(tensornames[0]).workers(s, ModeKeys.TRAIN): + assert tr.tensor(tensornames[0]).value(s, worker=w, mode=ModeKeys.TRAIN) is not None + assert ( + len(tr.tensor(tensornames[0]).workers(s, ModeKeys.TRAIN)) + == strategy.num_replicas_in_sync + ) + + for tname in tr.tensors(collection="losses"): + if tname != tensornames[0]: + for s in tr.tensor(tname).steps(ModeKeys.TRAIN): + assert len(tr.tensor(tname).workers(s, ModeKeys.TRAIN)) == 1 + assert tr.tensor(tname).value(s, mode=ModeKeys.TRAIN) is not None + + tname = "sparse_softmax_cross_entropy_loss/value:0" + for s in tr.tensor(tname).steps(ModeKeys.EVAL): + assert len(tr.tensor(tname).workers(s, ModeKeys.EVAL)) == strategy.num_replicas_in_sync + assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None + + +@pytest.mark.slow +def test_eval_distributed(out_dir): + strategy = helper_mirrored( + out_dir, + steps=["train", "eval"], + include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.BIASES, CollectionKeys.LOSSES], + eval_distributed=True, + ) + if skip_trial_check(): + return + tr = create_trial_fast_refresh(out_dir) + assert len(tr.tensors()) == 8 + 1 * strategy.num_replicas_in_sync + 1 + assert len(tr.steps()) == 4 + assert len(tr.steps(ModeKeys.TRAIN)) == 2 + assert len(tr.steps(ModeKeys.EVAL)) == 2 + + for tname in tr.tensors(collection="weights"): + for s in tr.tensor(tname).steps(ModeKeys.TRAIN): + assert len(tr.tensor(tname).workers(s, ModeKeys.TRAIN)) == strategy.num_replicas_in_sync + for worker in tr.tensor(tname).workers(s, ModeKeys.TRAIN): + assert tr.tensor(tname).value(s, worker=worker, mode=ModeKeys.TRAIN) is not None + for s in tr.tensor(tname).steps(ModeKeys.EVAL): + assert len(tr.tensor(tname).workers(s, ModeKeys.EVAL)) == strategy.num_replicas_in_sync + assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None + + tensornames = tr.tensors(regex="Identity_\d+:0") + for s in tr.tensor(tensornames[0]).steps(ModeKeys.TRAIN): + for w in tr.tensor(tensornames[0]).workers(s, ModeKeys.TRAIN): + assert tr.tensor(tensornames[0]).value(s, worker=w, mode=ModeKeys.TRAIN) is not None + assert ( + len(tr.tensor(tensornames[0]).workers(s, ModeKeys.TRAIN)) + == strategy.num_replicas_in_sync + ) + + for tname in tr.tensors(collection="losses"): + for s in tr.tensor(tname).steps(ModeKeys.EVAL): + assert len(tr.tensor(tname).workers(s, ModeKeys.EVAL)) == 1 + assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None + if tname != tensornames[0]: + for s in tr.tensor(tname).steps(ModeKeys.TRAIN): + assert len(tr.tensor(tname).workers(s, ModeKeys.EVAL)) == 1 + assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None + + +@pytest.mark.slow +def test_reductions(out_dir): + strategy = helper_mirrored( + out_dir, + steps=["train", "eval"], + reduction_config=smd.ReductionConfig( + reductions=["sum", "max"], abs_reductions=["sum", "max"], norms=["l1"] + ), + include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.BIASES, CollectionKeys.LOSSES], + eval_distributed=True, + ) + if skip_trial_check(): + return + + tr = create_trial_fast_refresh(out_dir) + assert len(tr.tensors()) == 8 + 1 * strategy.num_replicas_in_sync + 1 + assert len(tr.steps()) == 4 + assert len(tr.steps(ModeKeys.TRAIN)) == 2 + assert len(tr.steps(ModeKeys.EVAL)) == 2 + + for tname in tr.tensors(collection="weights"): + for s in tr.tensor(tname).steps(ModeKeys.TRAIN): + try: + tr.tensor(tname).value(s, mode=ModeKeys.TRAIN) + assert False + except TensorUnavailableForStep: + # for some tensors l1 reduction can't be saved due to improper dimensions for the reduction + assert len(tr.tensor(tname).reduction_values(s, mode=ModeKeys.TRAIN)) >= 4 + + for s in tr.tensor(tname).steps(ModeKeys.EVAL): + try: + tr.tensor(tname).value(s, mode=ModeKeys.EVAL) + assert False + except TensorUnavailableForStep: + # for some tensors l1 reduction can't be saved due to improper dimensions for the reduction + assert len(tr.tensor(tname).reduction_values(s, mode=ModeKeys.EVAL)) >= 4 + + for tname in tr.tensors(collection="losses"): + for s in tr.tensor(tname).steps(ModeKeys.EVAL): + assert len(tr.tensor(tname).reduction_values(s, mode=ModeKeys.EVAL)) == 0 + assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None + + for tname in tr.tensors(collection="losses"): + for s in tr.tensor(tname).steps(ModeKeys.TRAIN): + assert len(tr.tensor(tname).reduction_values(s, mode=ModeKeys.TRAIN)) == 0 + assert tr.tensor(tname).value(s, mode=ModeKeys.TRAIN) is not None + + +@pytest.mark.slow +def test_save_all(out_dir): + strategy = helper_mirrored( + out_dir, steps=["train"], num_steps=1, save_all=True, eval_distributed=True + ) + if skip_trial_check(): + return + tr = create_trial_fast_refresh(out_dir) + assert len(tr.tensors()) > 100 + assert len(tr.steps()) + assert len(tr.tensors(collection="weights")) + assert len(tr.tensors(collection="biases")) + assert len(tr.tensors(collection="gradients")) + + +@pytest.mark.slow +def test_save_all_worker(out_dir): + # skip test if no gpus available + if get_available_gpus() == 0: + return + strategy = helper_mirrored( + out_dir, + steps=["train"], + num_steps=1, + save_all=True, + eval_distributed=True, + include_workers="all", + ) + tr = create_trial_fast_refresh(out_dir) + assert len(tr.steps()) + assert len(tr.workers()) == get_available_gpus() + assert len(tr.tensors(collection="weights")) + assert "conv2d/kernel:0" in tr.tensors(collection="weights") + assert len(tr.tensor("conv2d/kernel:0").workers(0)) == strategy.num_replicas_in_sync + assert len(tr.tensors(collection="biases")) + assert "conv2d/bias:0" in tr.tensors(collection="biases") + assert len(tr.tensor("conv2d/bias:0").workers(0)) == strategy.num_replicas_in_sync + assert len(tr.tensors(collection="gradients")) + + +@pytest.mark.slow +def test_save_one_worker(out_dir): + strategy = helper_mirrored( + out_dir, + steps=["train"], + num_steps=1, + save_all=True, + eval_distributed=True, + include_workers="one", + ) + tr = create_trial_fast_refresh(out_dir) + assert len(tr.workers()) == 1 + assert len(tr.steps()) + assert len(tr.tensors(collection="weights")) + assert len(tr.tensors(collection="biases")) + assert len(tr.tensors(collection="gradients")) diff --git a/tests/zero_code_change/tests/tensorflow/keras/test_keras_mirrored.py b/tests/zero_code_change/tests/tensorflow/keras/test_keras_mirrored.py new file mode 100644 index 000000000..71b5e6923 --- /dev/null +++ b/tests/zero_code_change/tests/tensorflow/keras/test_keras_mirrored.py @@ -0,0 +1,560 @@ +# Future +from __future__ import absolute_import, division, print_function, unicode_literals + +# Standard Library +import os + +# Third Party +import pytest +import tensorflow as tf +import tensorflow_datasets as tfds +from tensorflow.python.client import device_lib +from tests.tensorflow.utils import create_trial_fast_refresh + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.access_layer import has_training_ended +from smdebug.core.collection import CollectionKeys +from smdebug.core.modes import ModeKeys +from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.exceptions import TensorUnavailable, TensorUnavailableForStep +from smdebug.tensorflow import ReductionConfig, SaveConfig +from smdebug.tensorflow.keras import KerasHook + +tfds.disable_progress_bar() + + +class FetchTensorCallback(tf.keras.callbacks.Callback): + def __init__(self, tensors): + self.tensors = tensors + self.fetches_added = False + + def _callback_fn(self, tensor_val): + assert tensor_val is not None + + def on_train_batch_begin(self, batch, logs): + try: + from tensorflow.python.keras.distribute.distributed_training_utils import ( + get_distributed_model, + ) + from tensorflow.python.keras.utils.mode_keys import ModeKeys as KerasModeKeys + + for t in self.tensors: + x = get_distributed_model(self.model, KerasModeKeys.TRAIN)._distributed_function + x.fetches.append(t) + x.fetch_callbacks[t] = self._callback_fn + self.fetches_added = True + except ImportError: + pass + + def on_train_batch_end(self, batch, logs): + if self.fetches_added: + # these should only be added if these were available above + from tensorflow.python.keras.distribute.distributed_training_utils import ( + get_distributed_model, + ) + from tensorflow.python.keras.utils.mode_keys import ModeKeys as KerasModeKeys + + for t in self.tensors: + x = get_distributed_model(self.model, KerasModeKeys.TRAIN)._distributed_function + x.fetches.remove(t) + del x.fetch_callbacks[t] + self.fetches_added = False + + +def get_available_gpus(): + local_device_protos = device_lib.list_local_devices() + return len([x.name for x in local_device_protos if x.device_type == "GPU"]) + + +def train_model( + trial_dir, + save_all=False, + hook=None, + include_collections=None, + reduction_config=None, + save_config=None, + use_keras_optimizer=True, + eager=False, + create_relu_collection=False, + strategy=None, + steps=None, + add_callbacks=None, + zcc=False, + include_workers="all", +): + print(tf.__version__) + tf.keras.backend.clear_session() + + datasets, info = tfds.load(name="mnist", with_info=True, as_supervised=True) + + mnist_train, mnist_test = datasets["train"], datasets["test"] + + if strategy is None: + strategy = tf.distribute.MirroredStrategy() + + # You can also do info.splits.total_num_examples to get the total + # number of examples in the dataset. + + BUFFER_SIZE = 10000 + + BATCH_SIZE_PER_REPLICA = 64 + BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync + + def scale(image, label): + image = tf.cast(image, tf.float32) + image /= 255 + + return image, label + + train_dataset = mnist_train.map(scale).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE) + eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE) + + if hook is None and not zcc: + if save_config is None: + save_config = SaveConfig(save_interval=3) + + hook = KerasHook( + out_dir=trial_dir, + save_config=save_config, + reduction_config=reduction_config, + include_collections=include_collections, + save_all=save_all, + include_workers=include_workers, + ) + + if not save_all and include_collections is not None: + for cname in hook.include_collections: + if cname not in include_collections: + hook.get_collection(cname).save_config = SaveConfig(end_step=0) + + if use_keras_optimizer: + opt = tf.keras.optimizers.Adam() + else: + opt = tf.train.AdamOptimizer(0.1) + + if not zcc: + opt = hook.wrap_optimizer(opt) + + with strategy.scope(): + relu_layer = tf.keras.layers.Dense(64, activation="relu") + model = tf.keras.Sequential( + [ + tf.keras.layers.Conv2D(32, 3, activation="relu", input_shape=(28, 28, 1)), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + relu_layer, + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) + model.compile( + loss="sparse_categorical_crossentropy", + optimizer=opt, + run_eagerly=eager, + metrics=["accuracy"], + ) + + if create_relu_collection: + hook.get_collection("relu").add_keras_layer(relu_layer, inputs=True, outputs=True) + + hooks = [] + if add_callbacks: + if "tensorboard" in add_callbacks: + hooks.append( + # write_grads = True causes crash saying handle must be created in scope + # erorr like this https://stackoverflow.com/questions/56836895/custom-training-loop-using-tensorflow-gpu-1-14-and-tf-distribute-mirroredstrateg + # this crash is even if tornasole callback is off + tf.keras.callbacks.TensorBoard( + log_dir="./logs", histogram_freq=4, write_images=True + ) + ) + if "fetch_tensor" in add_callbacks: + hooks.append(FetchTensorCallback(model.weights)) + if not zcc: + hooks.append(hook) + + if steps is None: + steps = ["train"] + for step in steps: + if step == "train": + model.fit(train_dataset, epochs=1, steps_per_epoch=10, callbacks=hooks, verbose=0) + elif step == "eval": + model.evaluate(eval_dataset, steps=10, callbacks=hooks, verbose=0) + elif step == "predict": + model.predict(train_dataset, steps=4, callbacks=hooks, verbose=0) + + smd.get_hook()._cleanup() + return strategy + + +@pytest.mark.skip( + "needs to be run individually as it complains that eager " + "needs to be set at startup, but pytest " + "does not allow controlling order of tests" +) +def test_tf_keras_eager(out_dir): + tf.enable_eager_execution() + train_model(out_dir, eager=True, steps=["train"]) + tf.disable_eager_execution() + + +@pytest.mark.skip( + "needs to be run individually as it complains that eager " + "needs to be set at startup, but pytest " + "does not allow controlling order of tests" +) +def test_tf_keras_eager_env(out_dir): + tf.enable_eager_execution() + train_model(out_dir, eager=False, steps=["train"]) + tf.disable_eager_execution() + + +def exhaustive_check(trial_dir, zcc=False, include_workers="one"): + include_collections = [ + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.GRADIENTS, + CollectionKeys.LOSSES, + CollectionKeys.OUTPUTS, + CollectionKeys.METRICS, + CollectionKeys.OPTIMIZER_VARIABLES, + ] + strategy = train_model( + trial_dir, + include_collections=include_collections, + steps=["train", "eval", "predict", "train"], + include_workers=include_workers, + zcc=zcc, + ) + + tr = create_trial_fast_refresh(trial_dir) + print(tr.tensors()) + + if include_workers == "all": + assert len(tr.workers()) == strategy.num_replicas_in_sync + assert len(tr.tensors()) == (6 + 6 + 1 + 3 + strategy.num_replicas_in_sync * 3 + 5) + else: + assert len(tr.workers()) == 1 + assert len(tr.tensors()) == (6 + 6 + 1 + 3 + 1 * 3 + 5) + + # 6 weights, 6 gradients, 1 loss, 3 metrics, 24 outputs (8 for each mode), 5 optimizer variables + assert len(tr.modes()) == 3 + assert len(tr.steps()) == 14 + assert len(tr.steps(ModeKeys.TRAIN)) == 8 # 0, 3, 6, 9, 12, 15, 18, 19(end of epoch) + assert len(tr.steps(ModeKeys.EVAL)) == 4 + assert len(tr.steps(ModeKeys.PREDICT)) == 2 # ran 4 steps above + + assert len(tr.tensors(collection=CollectionKeys.BIASES)) == 3 + wtnames = tr.tensors(collection=CollectionKeys.WEIGHTS) + assert len(wtnames) == 3 + + for wtname in wtnames: + assert len(tr.tensor(wtname).steps()) == 13, wtname + assert len(tr.tensor(wtname).steps(ModeKeys.TRAIN)) == 7 + for s in tr.tensor(wtname).steps(ModeKeys.TRAIN): + assert tr.tensor(wtname).value(s, mode=ModeKeys.TRAIN) is not None + for worker in tr.workers(): + assert tr.tensor(wtname).value(s, mode=ModeKeys.TRAIN, worker=worker) is not None + assert len(tr.tensor(wtname).steps(ModeKeys.EVAL)) == 4 + for s in tr.tensor(wtname).steps(ModeKeys.EVAL): + assert tr.tensor(wtname).value(s, mode=ModeKeys.EVAL) is not None + for worker in tr.workers(): + assert tr.tensor(wtname).value(s, mode=ModeKeys.EVAL, worker=worker) is not None + assert len(tr.tensor(wtname).steps(ModeKeys.PREDICT)) == 2 + + gradnames = tr.tensors(collection=CollectionKeys.GRADIENTS) + assert len(gradnames) == 6 + for gradname in gradnames: + assert len(tr.tensor(gradname).steps(ModeKeys.TRAIN)) == 7 + for s in tr.tensor(gradname).steps(ModeKeys.TRAIN): + assert tr.tensor(gradname).value(s, mode=ModeKeys.TRAIN) is not None + assert len(tr.tensor(gradname).steps(ModeKeys.EVAL)) == 0 + assert len(tr.tensor(gradname).steps(ModeKeys.PREDICT)) == 0 + + optvarnames = tr.tensors(collection=CollectionKeys.OPTIMIZER_VARIABLES) + assert len(optvarnames) == 5 + for optvarname in optvarnames: + assert len(tr.tensor(optvarname).steps(ModeKeys.TRAIN)) == 7 + for s in tr.tensor(optvarname).steps(ModeKeys.TRAIN): + assert tr.tensor(optvarname).value(s, mode=ModeKeys.TRAIN) is not None + assert len(tr.tensor(optvarname).steps(ModeKeys.EVAL)) == 0 + assert len(tr.tensor(optvarname).steps(ModeKeys.PREDICT)) == 0 + + assert len(tr.tensors(collection=CollectionKeys.LOSSES)) == 1 + loss_name = tr.tensors(collection=CollectionKeys.LOSSES)[0] + # loss is not in predict mode (so less 2) + # add one for end of epoch + assert len(tr.tensor(loss_name).steps(ModeKeys.TRAIN)) == 8 + assert len(tr.tensor(loss_name).steps(ModeKeys.EVAL)) == 4 + assert len(tr.tensor(loss_name).steps(ModeKeys.PREDICT)) == 0 + assert len(tr.tensor(loss_name).steps()) == 12 + + metricnames = tr.tensors(collection=CollectionKeys.METRICS) + assert len(metricnames) == 3 + + +@pytest.mark.slow +def test_tf_keras(out_dir, zcc=False, include_workers="all"): + exhaustive_check(out_dir, zcc=zcc, include_workers=include_workers) + + +@pytest.mark.slow +def test_tf_keras_non_keras_opt(out_dir): + include_collections = [ + CollectionKeys.GRADIENTS, + CollectionKeys.OPTIMIZER_VARIABLES, + CollectionKeys.METRICS, + ] + train_model( + out_dir, + include_collections=include_collections, + use_keras_optimizer=False, + steps=["train", "eval"], + ) + tr = create_trial_fast_refresh(out_dir) + assert len(tr.modes()) == 2 + assert len(tr.steps(ModeKeys.TRAIN)) == 4 # 0, 3, 6, 9 + assert len(tr.tensors(collection=CollectionKeys.GRADIENTS)) == 6 + gradient_name = tr.tensors(collection=CollectionKeys.GRADIENTS)[0] + assert len(tr.tensor(gradient_name).steps(ModeKeys.TRAIN)) == 4 + assert len(tr.tensor(gradient_name).steps(ModeKeys.EVAL)) == 0 + + # not supported for non keras optimizer with keras + assert len(tr.tensors(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 0 + + +@pytest.mark.slow +def test_save_all(out_dir): + strategy = train_model( + out_dir, + include_collections=None, + save_all=True, + save_config=SaveConfig(save_steps=[5]), + steps=["train"], + ) + tr = create_trial_fast_refresh(out_dir) + print(tr.tensors()) + assert ( + len(tr.tensors()) + == 6 + 6 + 5 + 3 + 1 + 3 * strategy.num_replicas_in_sync + 2 * strategy.num_replicas_in_sync + ) + # weights, grads, optimizer_variables, metrics, losses, outputs + assert len(tr.steps()) == 3 + + +@pytest.mark.slow +def test_save_one_worker(out_dir): + strategy = train_model( + out_dir, + include_collections=None, + save_all=True, + save_config=SaveConfig(save_steps=[5]), + steps=["train"], + include_workers="one", + ) + tr = create_trial_fast_refresh(out_dir) + assert len(tr.workers()) == 1 + assert len(tr.steps()) + assert len(tr.tensors(collection="weights")) + assert len(tr.tensors(collection="weights")) + assert len(tr.tensor(tr.tensors(collection="weights")[0]).workers(0)) == 1 + assert len(tr.tensors(collection="biases")) + assert len(tr.tensor(tr.tensors(collection="biases")[0]).workers(0)) == 1 + assert len(tr.tensors(collection="gradients")) + + +@pytest.mark.slow +def test_save_all_workers(out_dir, zcc=False): + # Skip if no GPUS + if get_available_gpus() == 0: + return + strategy = train_model( + out_dir, + include_collections=None, + save_all=True, + save_config=SaveConfig(save_steps=[5]), + steps=["train"], + include_workers="all", + ) + tr = create_trial_fast_refresh(out_dir) + assert len(tr.workers()) == get_available_gpus() + assert len(tr.tensors(collection="weights")) + assert ( + len(tr.tensor(tr.tensors(collection="weights")[0]).workers(0)) + == strategy.num_replicas_in_sync + ) + + assert "conv2d/weights/conv2d/kernel:0" in tr.tensors(collection="weights") + assert ( + len(tr.tensor("conv2d/weights/conv2d/kernel:0").workers(0)) == strategy.num_replicas_in_sync + ) + + assert len(tr.tensors(collection="biases")) + assert "conv2d/weights/conv2d/bias:0" in tr.tensors(collection="biases") + assert ( + len(tr.tensor(tr.tensors(collection="biases")[0]).workers(0)) + == strategy.num_replicas_in_sync + ) + assert len(tr.tensors(collection="gradients")) + + +@pytest.mark.slow +def test_base_reductions(out_dir): + train_model( + out_dir, + include_collections=[ + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.METRICS, + CollectionKeys.LOSSES, + ], + reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + steps=["train"], + ) + + tr = create_trial_fast_refresh(out_dir) + weight_name = tr.tensors(collection=CollectionKeys.WEIGHTS)[0] + + try: + tr.tensor(weight_name).value(0) + assert False + except TensorUnavailableForStep: + assert tr.tensor(weight_name).reduction_values(0) + + loss_name = tr.tensors(collection=CollectionKeys.LOSSES)[0] + assert tr.tensor(loss_name).value(0) is not None + + metric_name = tr.tensors(collection=CollectionKeys.METRICS)[0] + assert tr.tensor(metric_name).value(0) is not None + + +@pytest.mark.slow +def test_collection_reductions(out_dir): + tf.reset_default_graph() + tf.keras.backend.clear_session() + hook = KerasHook( + out_dir=out_dir, + save_config=SaveConfig(save_interval=3), + include_collections=[ + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.GRADIENTS, + ], + ) + hook.get_collection(CollectionKeys.GRADIENTS).reduction_config = ReductionConfig(norms=["l1"]) + train_model(out_dir, hook=hook, steps=["train"]) + + tr = create_trial_fast_refresh(out_dir) + weight_name = tr.tensors(collection=CollectionKeys.WEIGHTS)[0] + grad_name = tr.tensors(collection=CollectionKeys.GRADIENTS)[0] + + try: + tr.tensor(weight_name).value(0) + tr.tensor(grad_name).value(0) + assert False + except TensorUnavailableForStep: + try: + assert tr.tensor(weight_name).reduction_value(0, "l1") is not None + except ValueError: + # some tensors reduction can't be computed + pass + except TensorUnavailable: + # sometimes we might not have tensor saved if it was only being + # saved as reduction and the reduction computation failed + pass + + +@pytest.mark.slow +def test_training_end(out_dir): + train_model(out_dir, include_collections=[CollectionKeys.OUTPUTS], steps=["train"]) + assert has_training_ended(out_dir) is True + + +@pytest.mark.slow +def test_collection_add(out_dir): + strategy = train_model( + out_dir, + include_collections=["relu"], + save_config=SaveConfig(save_interval=9), + create_relu_collection=True, + steps=["train"], + ) + + tr = create_trial_fast_refresh(out_dir) + relu_coll_tensor_names = tr.tensors(collection="relu") + + assert len(relu_coll_tensor_names) == strategy.num_replicas_in_sync * 2 + assert tr.tensor(relu_coll_tensor_names[0]).value(0) is not None + assert tr.tensor(relu_coll_tensor_names[1]).value(0) is not None + + +@pytest.mark.slow +def test_include_regex(out_dir): + hook = KerasHook( + out_dir=out_dir, + save_config=SaveConfig(save_interval=9), + include_collections=["custom_coll"], + include_workers="all", + ) + hook.get_collection("custom_coll").include("dense") + strategy = train_model(out_dir, hook=hook, steps=["train"]) + + tr = create_trial_fast_refresh(out_dir) + tnames = tr.tensors(collection="custom_coll") + + assert len(tnames) == 4 + 3 * strategy.num_replicas_in_sync + for tname in tnames: + assert tr.tensor(tname).value(0) is not None + + +@pytest.mark.slow +def test_clash_with_tb_callback(out_dir): + train_model( + out_dir, + save_config=SaveConfig(save_interval=9), + include_collections=[ + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.GRADIENTS, + CollectionKeys.LOSSES, + CollectionKeys.METRICS, + ], + steps=["train"], + add_callbacks=["tensorboard"], + ) + tr = create_trial_fast_refresh(out_dir) + assert len(tr.tensors()) == 16 + + +@pytest.mark.slow +def test_clash_with_custom_callback(out_dir): + strategy = train_model( + out_dir, + include_collections=[ + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.OUTPUTS, + CollectionKeys.GRADIENTS, + ], + save_config=SaveConfig(save_interval=9), + steps=["train"], + add_callbacks=["fetch_tensor"], + ) + tr = create_trial_fast_refresh(out_dir) + assert len(tr.tensors()) == 6 + 6 + strategy.num_replicas_in_sync * 1 + 3 + + +def test_one_device(out_dir): + strategy = train_model( + out_dir, + include_collections=[ + CollectionKeys.WEIGHTS, + CollectionKeys.BIASES, + CollectionKeys.OUTPUTS, + CollectionKeys.GRADIENTS, + ], + save_config=SaveConfig(save_interval=9), + strategy=tf.distribute.OneDeviceStrategy(device="/cpu:0"), + steps=["train"], + ) + assert os.path.isdir(os.path.join(out_dir, "events")) is False diff --git a/tests/zero_code_change/tests/tensorflow/utils.py b/tests/zero_code_change/tests/tensorflow/utils.py new file mode 100644 index 000000000..e8e3d8af6 --- /dev/null +++ b/tests/zero_code_change/tests/tensorflow/utils.py @@ -0,0 +1,8 @@ +# First Party +from smdebug.trials import create_trial + + +def create_trial_fast_refresh(path, **kwargs): + tr = create_trial(path, **kwargs) + tr.training_end_delay_refresh = 0.01 + return tr diff --git a/tests/zero_code_change/tests_path.py b/tests/zero_code_change/tests_path.py deleted file mode 100644 index f58ea66ce..000000000 --- a/tests/zero_code_change/tests_path.py +++ /dev/null @@ -1,6 +0,0 @@ -# Standard Library -import os -import sys - -# Hack to import tests.tensorflow -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) From ce13ca4780ad6f831d37f6aa560d9d2d58dc293f Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Sat, 23 Nov 2019 16:12:17 -0800 Subject: [PATCH 04/11] Release 0.4.11 --- smdebug/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/_version.py b/smdebug/_version.py index 805e7c470..58ce5cd17 100644 --- a/smdebug/_version.py +++ b/smdebug/_version.py @@ -1 +1 @@ -__version__ = "0.4.10" +__version__ = "0.4.11" From 04717d24e7085efaee027e88c58e17cf47cc6770 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 26 Nov 2019 16:15:28 -0800 Subject: [PATCH 05/11] Update version --- smdebug/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/_version.py b/smdebug/_version.py index 58ce5cd17..9b084a609 100644 --- a/smdebug/_version.py +++ b/smdebug/_version.py @@ -1 +1 @@ -__version__ = "0.4.11" +__version__ = "0.4.12" From d634b240978bd9a8958ccb3fac458079d3b32814 Mon Sep 17 00:00:00 2001 From: Owen Thomas <31292660+owen-t@users.noreply.github.com> Date: Fri, 29 Nov 2019 18:14:17 -0800 Subject: [PATCH 06/11] Pass iteration_number to metrics.log_metric as keyword argument. Fix bug where it was being passed to the timestamp positional argument. (#62) Overriding CI fail due to urgency and clear fix. (cherry picked from commit 41fdc8088b70769a592c13931a06e765615485b6) --- smdebug/core/hook.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 8782ded1f..f4bd1f59e 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -569,7 +569,8 @@ def _write_scalars(self): write_tb = scalar_obj.write_tb write_event = scalar_obj.write_event if self.metrics_writer and sm_metric: - self.metrics_writer.log_metric(scalar_name, scalar_val, self.mode_steps[self.mode]) + self.metrics_writer.log_metric(scalar_name, scalar_val, + iteration_number=self.mode_steps[self.mode]) if write_tb: tb_writer = self._maybe_get_tb_writer() if tb_writer: From 6302a298c050f8b39649a5dca40b2b3e237c8527 Mon Sep 17 00:00:00 2001 From: Denis Davydenko Date: Fri, 29 Nov 2019 21:43:57 -0800 Subject: [PATCH 07/11] Update version (0.4.12 -> 0.4.13) --- smdebug/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/_version.py b/smdebug/_version.py index 9b084a609..4b2ce7df3 100644 --- a/smdebug/_version.py +++ b/smdebug/_version.py @@ -1 +1 @@ -__version__ = "0.4.12" +__version__ = "0.4.13" From 86f33836fe355cbc527fc916d02f62df897291b4 Mon Sep 17 00:00:00 2001 From: Denis Davydenko Date: Mon, 2 Dec 2019 12:31:48 -0800 Subject: [PATCH 08/11] Update NOTICE (cherry picked from commit 647f0005d3df452192d7565da76b7a0185bb0d6d) --- NOTICE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NOTICE b/NOTICE index 17ce381eb..740e5df1f 100644 --- a/NOTICE +++ b/NOTICE @@ -1,2 +1,2 @@ -Tornasole_core +Amazon SageMaker Debugger Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. From a3de06dab8c7039ff4392a5993a2c63dd3680942 Mon Sep 17 00:00:00 2001 From: Denis Davydenko Date: Mon, 2 Dec 2019 13:52:34 -0800 Subject: [PATCH 09/11] Create THIRD-PARTY (cherry picked from commit 5bfdbf439f966a86ea5904d527d9e39597fdfa97) --- THIRD-PARTY | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 THIRD-PARTY diff --git a/THIRD-PARTY b/THIRD-PARTY new file mode 100644 index 000000000..1a95df9c9 --- /dev/null +++ b/THIRD-PARTY @@ -0,0 +1,203 @@ +** Tensorboard; version 1.13.0 -- https://github.com/tensorflow/tensorboard +Copyright 2017 The TensorFlow Authors. All rights reserved. + +Apache License + +Version 2.0, January 2004 + +http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND +DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, and + distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by the + copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all other + entities that control, are controlled by, or are under common control + with that entity. For the purposes of this definition, "control" means + (i) the power, direct or indirect, to cause the direction or management + of such entity, whether by contract or otherwise, or (ii) ownership of + fifty percent (50%) or more of the outstanding shares, or (iii) + beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity exercising + permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation source, + and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but not limited + to compiled object code, generated documentation, and conversions to + other media types. + + "Work" shall mean the work of authorship, whether in Source or Object + form, made available under the License, as indicated by a copyright + notice that is included in or attached to the work (an example is + provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object form, + that is based on (or derived from) the Work and for which the editorial + revisions, annotations, elaborations, or other modifications represent, + as a whole, an original work of authorship. For the purposes of this + License, Derivative Works shall not include works that remain separable + from, or merely link (or bind by name) to the interfaces of, the Work and + Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including the original + version of the Work and any modifications or additions to that Work or + Derivative Works thereof, that is intentionally submitted to Licensor for + inclusion in the Work by the copyright owner or by an individual or Legal + Entity authorized to submit on behalf of the copyright owner. For the + purposes of this definition, "submitted" means any form of electronic, + verbal, or written communication sent to the Licensor or its + representatives, including but not limited to communication on electronic + mailing lists, source code control systems, and issue tracking systems + that are managed by, or on behalf of, the Licensor for the purpose of + discussing and improving the Work, but excluding communication that is + conspicuously marked or otherwise designated in writing by the copyright + owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity on + behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of this + License, each Contributor hereby grants to You a perpetual, worldwide, + non-exclusive, no-charge, royalty-free, irrevocable copyright license to + reproduce, prepare Derivative Works of, publicly display, publicly perform, + sublicense, and distribute the Work and such Derivative Works in Source or + Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of this + License, each Contributor hereby grants to You a perpetual, worldwide, + non-exclusive, no-charge, royalty-free, irrevocable (except as stated in + this section) patent license to make, have made, use, offer to sell, sell, + import, and otherwise transfer the Work, where such license applies only to + those patent claims licensable by such Contributor that are necessarily + infringed by their Contribution(s) alone or by combination of their + Contribution(s) with the Work to which such Contribution(s) was submitted. + If You institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work or a + Contribution incorporated within the Work constitutes direct or contributory + patent infringement, then any patent licenses granted to You under this + License for that Work shall terminate as of the date such litigation is + filed. + + 4. Redistribution. You may reproduce and distribute copies of the Work or + Derivative Works thereof in any medium, with or without modifications, and + in Source or Object form, provided that You meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative Works a + copy of this License; and + + (b) You must cause any modified files to carry prominent notices stating + that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works that You + distribute, all copyright, patent, trademark, and attribution notices + from the Source form of the Work, excluding those notices that do not + pertain to any part of the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must include + a readable copy of the attribution notices contained within such NOTICE + file, excluding those notices that do not pertain to any part of the + Derivative Works, in at least one of the following places: within a + NOTICE text file distributed as part of the Derivative Works; within the + Source form or documentation, if provided along with the Derivative + Works; or, within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents of the + NOTICE file are for informational purposes only and do not modify the + License. You may add Your own attribution notices within Derivative Works + that You distribute, alongside or as an addendum to the NOTICE text from + the Work, provided that such additional attribution notices cannot be + construed as modifying the License. + + You may add Your own copyright statement to Your modifications and may + provide additional or different license terms and conditions for use, + reproduction, or distribution of Your modifications, or for any such + Derivative Works as a whole, provided Your use, reproduction, and + distribution of the Work otherwise complies with the conditions stated in + this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, any + Contribution intentionally submitted for inclusion in the Work by You to the + Licensor shall be under the terms and conditions of this License, without + any additional terms or conditions. Notwithstanding the above, nothing + herein shall supersede or modify the terms of any separate license agreement + you may have executed with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, except + as required for reasonable and customary use in describing the origin of the + Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in + writing, Licensor provides the Work (and each Contributor provides its + Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied, including, without limitation, any + warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or + FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining + the appropriateness of using or redistributing the Work and assume any risks + associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, whether + in tort (including negligence), contract, or otherwise, unless required by + applicable law (such as deliberate and grossly negligent acts) or agreed to + in writing, shall any Contributor be liable to You for damages, including + any direct, indirect, special, incidental, or consequential damages of any + character arising as a result of this License or out of the use or inability + to use the Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all other + commercial damages or losses), even if such Contributor has been advised of + the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing the Work + or Derivative Works thereof, You may choose to offer, and charge a fee for, + acceptance of support, warranty, indemnity, or other liability obligations + and/or rights consistent with this License. However, in accepting such + obligations, You may act only on Your own behalf and on Your sole + responsibility, not on behalf of any other Contributor, and only if You + agree to indemnify, defend, and hold each Contributor harmless for any + liability incurred by, or claims asserted against, such Contributor by + reason of your accepting any such warranty or additional liability. END OF + TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "[]" replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same "printed page" as the copyright notice for easier identification +within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); + +you may not use this file except in compliance with the License. + +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software + +distributed under the License is distributed on an "AS IS" BASIS, + +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and + +limitations under the License. + +* For Tensorboard see also this required NOTICE: + Copyright 2017 The TensorFlow Authors. All rights reserved. From 0187212fc3e8e93c48b5687b2e584d7335c42c08 Mon Sep 17 00:00:00 2001 From: Denis Davydenko Date: Mon, 2 Dec 2019 13:58:10 -0800 Subject: [PATCH 10/11] Update THIRD-PARTY (cherry picked from commit e18a8bffdbbdfaeb5f38a16ba2e048b2ec60161c) --- THIRD-PARTY | 2 ++ 1 file changed, 2 insertions(+) diff --git a/THIRD-PARTY b/THIRD-PARTY index 1a95df9c9..9bb026fce 100644 --- a/THIRD-PARTY +++ b/THIRD-PARTY @@ -1,6 +1,8 @@ ** Tensorboard; version 1.13.0 -- https://github.com/tensorflow/tensorboard Copyright 2017 The TensorFlow Authors. All rights reserved. +This project has been modified for use in Amazon SageMaker Debugger. + Apache License Version 2.0, January 2004 From b20b2e42730441b598eeadaaf9022d9ba8410aa7 Mon Sep 17 00:00:00 2001 From: Denis Davydenko Date: Mon, 2 Dec 2019 15:54:33 -0800 Subject: [PATCH 11/11] Mention modifications of original .proto files from TB. --- THIRD-PARTY => LICENSE-THIRD-PARTY | 0 setup.py | 1 + smdebug/core/tfevent/proto/attr_value.proto | 1 + smdebug/core/tfevent/proto/event.proto | 1 + smdebug/core/tfevent/proto/graph.proto | 1 + smdebug/core/tfevent/proto/node_def.proto | 1 + smdebug/core/tfevent/proto/resource_handle.proto | 1 + smdebug/core/tfevent/proto/summary.proto | 1 + smdebug/core/tfevent/proto/tensor.proto | 1 + smdebug/core/tfevent/proto/tensor_shape.proto | 1 + smdebug/core/tfevent/proto/types.proto | 1 + smdebug/core/tfevent/proto/versions.proto | 1 + 12 files changed, 11 insertions(+) rename THIRD-PARTY => LICENSE-THIRD-PARTY (100%) diff --git a/THIRD-PARTY b/LICENSE-THIRD-PARTY similarity index 100% rename from THIRD-PARTY rename to LICENSE-THIRD-PARTY diff --git a/setup.py b/setup.py index 8b39686ce..85fce806f 100644 --- a/setup.py +++ b/setup.py @@ -58,6 +58,7 @@ def build_package(version): setup_requires=["pytest-runner"], tests_require=TESTS_PACKAGES, python_requires=">=3.6", + license='Apache License Version 2.0' ) diff --git a/smdebug/core/tfevent/proto/attr_value.proto b/smdebug/core/tfevent/proto/attr_value.proto index 5fe54e43e..de7041229 100644 --- a/smdebug/core/tfevent/proto/attr_value.proto +++ b/smdebug/core/tfevent/proto/attr_value.proto @@ -1,3 +1,4 @@ +/* This file was modified for Amazon Sagemaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */ syntax = "proto3"; package smdebug; diff --git a/smdebug/core/tfevent/proto/event.proto b/smdebug/core/tfevent/proto/event.proto index 73a7f6ab5..a7c91943e 100644 --- a/smdebug/core/tfevent/proto/event.proto +++ b/smdebug/core/tfevent/proto/event.proto @@ -1,3 +1,4 @@ +/* This file was modified for Amazon Sagemaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */ syntax = "proto3"; package smdebug; diff --git a/smdebug/core/tfevent/proto/graph.proto b/smdebug/core/tfevent/proto/graph.proto index 1c5090ae7..dae5c33fa 100644 --- a/smdebug/core/tfevent/proto/graph.proto +++ b/smdebug/core/tfevent/proto/graph.proto @@ -1,3 +1,4 @@ +/* This file was modified for Amazon Sagemaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */ syntax = "proto3"; package smdebug; diff --git a/smdebug/core/tfevent/proto/node_def.proto b/smdebug/core/tfevent/proto/node_def.proto index b79617153..6fa628291 100644 --- a/smdebug/core/tfevent/proto/node_def.proto +++ b/smdebug/core/tfevent/proto/node_def.proto @@ -1,3 +1,4 @@ +/* This file was modified for Amazon Sagemaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */ syntax = "proto3"; package smdebug; diff --git a/smdebug/core/tfevent/proto/resource_handle.proto b/smdebug/core/tfevent/proto/resource_handle.proto index 6536161bb..7a20b9e61 100644 --- a/smdebug/core/tfevent/proto/resource_handle.proto +++ b/smdebug/core/tfevent/proto/resource_handle.proto @@ -1,3 +1,4 @@ +/* This file was modified for Amazon Sagemaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */ syntax = "proto3"; package smdebug; diff --git a/smdebug/core/tfevent/proto/summary.proto b/smdebug/core/tfevent/proto/summary.proto index 5a8c41b94..10206307f 100644 --- a/smdebug/core/tfevent/proto/summary.proto +++ b/smdebug/core/tfevent/proto/summary.proto @@ -1,3 +1,4 @@ +/* This file was modified for Amazon Sagemaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */ syntax = "proto3"; package smdebug; diff --git a/smdebug/core/tfevent/proto/tensor.proto b/smdebug/core/tfevent/proto/tensor.proto index 193df010a..56b87bbd5 100644 --- a/smdebug/core/tfevent/proto/tensor.proto +++ b/smdebug/core/tfevent/proto/tensor.proto @@ -1,3 +1,4 @@ +/* This file was modified for Amazon Sagemaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */ syntax = "proto3"; package smdebug; diff --git a/smdebug/core/tfevent/proto/tensor_shape.proto b/smdebug/core/tfevent/proto/tensor_shape.proto index e7c87d666..91c502032 100644 --- a/smdebug/core/tfevent/proto/tensor_shape.proto +++ b/smdebug/core/tfevent/proto/tensor_shape.proto @@ -1,3 +1,4 @@ +/* This file was modified for Amazon Sagemaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */ // Protocol buffer representing the shape of tensors. syntax = "proto3"; diff --git a/smdebug/core/tfevent/proto/types.proto b/smdebug/core/tfevent/proto/types.proto index 15cea1995..df7e65f05 100644 --- a/smdebug/core/tfevent/proto/types.proto +++ b/smdebug/core/tfevent/proto/types.proto @@ -1,3 +1,4 @@ +/* This file was modified for Amazon Sagemaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */ syntax = "proto3"; package smdebug; diff --git a/smdebug/core/tfevent/proto/versions.proto b/smdebug/core/tfevent/proto/versions.proto index c27a8d91a..80ed94c45 100644 --- a/smdebug/core/tfevent/proto/versions.proto +++ b/smdebug/core/tfevent/proto/versions.proto @@ -1,3 +1,4 @@ +/* This file was modified for Amazon Sagemaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */ syntax = "proto3"; package smdebug;