From 0291535f006efa61405cbbcff495e531600c14e0 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol <huilgolr@amazon.com>
Date: Thu, 21 Nov 2019 16:06:51 -0800
Subject: [PATCH 01/11] Update version

---
 smdebug/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smdebug/_version.py b/smdebug/_version.py
index 793c25142..a3a9bd544 100644
--- a/smdebug/_version.py
+++ b/smdebug/_version.py
@@ -1 +1 @@
-__version__ = "0.5a"
+__version__ = "0.4.8"

From 50940b8aedc673f5c7fcfff0f38302fe6388e1fa Mon Sep 17 00:00:00 2001
From: Rahul Huilgol <huilgolr@amazon.com>
Date: Fri, 22 Nov 2019 19:23:01 -0800
Subject: [PATCH 02/11] Bump up version to 0.4.10, skipping 0.4.9 due to
 confusion of reverting release

---
 smdebug/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smdebug/_version.py b/smdebug/_version.py
index a3a9bd544..805e7c470 100644
--- a/smdebug/_version.py
+++ b/smdebug/_version.py
@@ -1 +1 @@
-__version__ = "0.4.8"
+__version__ = "0.4.10"

From 4b79a0fc4c46c82e0e7059cffa8733b8e9e52fcb Mon Sep 17 00:00:00 2001
From: Rahul Huilgol <huilgolr@amazon.com>
Date: Sat, 23 Nov 2019 05:43:08 +0000
Subject: [PATCH 03/11] Get integration tests working again without patch hacks

---
 .../tensorflow_integration_tests.py           |   2 -
 tests/zero_code_change/tests/__init__.py      |   0
 .../hooks/test_mirrored_strategy.py           | 493 +++++++++++++++
 .../tensorflow/keras/test_keras_mirrored.py   | 560 ++++++++++++++++++
 .../tests/tensorflow/utils.py                 |   8 +
 tests/zero_code_change/tests_path.py          |   6 -
 6 files changed, 1061 insertions(+), 8 deletions(-)
 create mode 100644 tests/zero_code_change/tests/__init__.py
 create mode 100644 tests/zero_code_change/tests/tensorflow/hooks/test_mirrored_strategy.py
 create mode 100644 tests/zero_code_change/tests/tensorflow/keras/test_keras_mirrored.py
 create mode 100644 tests/zero_code_change/tests/tensorflow/utils.py
 delete mode 100644 tests/zero_code_change/tests_path.py

diff --git a/tests/zero_code_change/tensorflow_integration_tests.py b/tests/zero_code_change/tensorflow_integration_tests.py
index 2e8d3f867..a6b2bcbc1 100644
--- a/tests/zero_code_change/tensorflow_integration_tests.py
+++ b/tests/zero_code_change/tensorflow_integration_tests.py
@@ -14,8 +14,6 @@
 We check that certain tensors are saved.
 Here in the test suite we delete the hook after every script.
 """
-from tests_path import *  # isort:skip
-
 # Standard Library
 import argparse
 
diff --git a/tests/zero_code_change/tests/__init__.py b/tests/zero_code_change/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/zero_code_change/tests/tensorflow/hooks/test_mirrored_strategy.py b/tests/zero_code_change/tests/tensorflow/hooks/test_mirrored_strategy.py
new file mode 100644
index 000000000..2fa105bfb
--- /dev/null
+++ b/tests/zero_code_change/tests/tensorflow/hooks/test_mirrored_strategy.py
@@ -0,0 +1,493 @@
+#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""Convolutional Neural Network Estimator for MNIST, built with tf.layers."""
+
+# Future
+from __future__ import absolute_import, division, print_function
+
+# Third Party
+import numpy as np
+import pytest
+import tensorflow as tf
+from tensorflow.python.client import device_lib
+from tests.tensorflow.utils import create_trial_fast_refresh
+
+# First Party
+import smdebug.tensorflow as smd
+from smdebug.core.collection import CollectionKeys
+from smdebug.core.modes import ModeKeys
+from smdebug.exceptions import TensorUnavailableForStep
+from smdebug.tensorflow import get_hook
+
+
+def cnn_model_fn(features, labels, mode):
+    """Estimator model_fn for an MNIST CNN; returns the EstimatorSpec matching `mode`."""
+    # Input Layer
+    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
+    # MNIST images are 28x28 pixels, and have one color channel
+    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
+
+    # Convolutional Layer #1
+    # Computes 32 features using a 5x5 filter with ReLU activation.
+    # Padding is added to preserve width and height.
+    # Input Tensor Shape: [batch_size, 28, 28, 1]
+    # Output Tensor Shape: [batch_size, 28, 28, 32]
+    conv1 = tf.layers.conv2d(
+        inputs=input_layer, filters=32, kernel_size=[5, 5], padding="same", activation=tf.nn.relu
+    )
+
+    # Pooling Layer #1
+    # First max pooling layer with a 2x2 filter and stride of 2
+    # Input Tensor Shape: [batch_size, 28, 28, 32]
+    # Output Tensor Shape: [batch_size, 14, 14, 32]
+    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
+
+    # Convolutional Layer #2
+    # Computes 64 features using a 5x5 filter with ReLU activation.
+    # Padding is added to preserve width and height.
+    # Input Tensor Shape: [batch_size, 14, 14, 32]
+    # Output Tensor Shape: [batch_size, 14, 14, 64]
+    conv2 = tf.layers.conv2d(
+        inputs=pool1, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu
+    )
+
+    # Pooling Layer #2
+    # Second max pooling layer with a 2x2 filter and stride of 2
+    # Input Tensor Shape: [batch_size, 14, 14, 64]
+    # Output Tensor Shape: [batch_size, 7, 7, 64]
+    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
+
+    # Flatten tensor into a batch of vectors
+    # Input Tensor Shape: [batch_size, 7, 7, 64]
+    # Output Tensor Shape: [batch_size, 7 * 7 * 64]
+    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
+
+    # Dense Layer
+    # Densely connected layer with 1024 neurons
+    # Input Tensor Shape: [batch_size, 7 * 7 * 64]
+    # Output Tensor Shape: [batch_size, 1024]
+    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
+
+    # Add dropout operation (TRAIN mode only); 0.6 probability that element will be kept
+    dropout = tf.layers.dropout(
+        inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN
+    )
+
+    # Logits layer
+    # Input Tensor Shape: [batch_size, 1024]
+    # Output Tensor Shape: [batch_size, 10]
+    logits = tf.layers.dense(inputs=dropout, units=10)
+
+    predictions = {
+        # Generate predictions (for PREDICT and EVAL mode)
+        "classes": tf.argmax(input=logits, axis=1),
+        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
+        # `logging_hook`.
+        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
+    }
+    if mode == tf.estimator.ModeKeys.PREDICT:
+        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
+
+    # Calculate Loss (for both TRAIN and EVAL modes)
+    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
+
+    # Configure the Training Op (for TRAIN mode)
+    if mode == tf.estimator.ModeKeys.TRAIN:
+        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
+        optimizer = smd.get_hook().wrap_optimizer(optimizer)  # wrapped by the smdebug hook
+        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
+        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
+
+    # Add evaluation metrics (for EVAL mode)
+    eval_metric_ops = {
+        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
+    }
+    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
+
+
+def per_device_batch_size(batch_size, num_gpus):
+    """For multi-gpu, batch-size must be a multiple of the number of GPUs.
+    Note that this should eventually be handled by DistributionStrategies
+    directly. Multi-GPU support is currently experimental, however,
+    so doing the work here until that feature is in place.
+    Args:
+      batch_size: Global batch size to be divided among devices. This should be
+        equal to num_gpus times the single-GPU batch_size for multi-gpu training.
+      num_gpus: How many GPUs are used with DistributionStrategies.
+    Returns:
+      Batch size per device (batch_size itself when num_gpus <= 1).
+    Raises:
+      ValueError: if batch_size is not divisible by number of devices
+    """
+    if num_gpus <= 1:
+        return batch_size
+
+    remainder = batch_size % num_gpus
+    if remainder:
+        err = (
+            "When running with multiple GPUs, batch size "
+            "must be a multiple of the number of available GPUs. Found {} "
+            "GPUs with a batch size of {}; try --batch_size={} instead."
+        ).format(num_gpus, batch_size, max(batch_size - remainder, num_gpus))  # never suggest 0
+        raise ValueError(err)
+    return int(batch_size // num_gpus)  # floor division: exact for ints of any size
+
+
+class InputFnProvider:
+    """Loads MNIST once and exposes it through tf.data input functions."""
+
+    def __init__(self, train_batch_size):
+        self.train_batch_size = train_batch_size
+        self.__load_data()
+
+    def __load_data(self):
+        """Store the train and test images and int32 labels on the instance."""
+        mnist = tf.contrib.learn.datasets.load_dataset("mnist")
+        train_split, eval_split = mnist.train, mnist.test
+        self.train_data = train_split.images
+        self.train_labels = np.asarray(train_split.labels, dtype=np.int32)
+        self.eval_data = eval_split.images
+        self.eval_labels = np.asarray(eval_split.labels, dtype=np.int32)
+
+    def train_input_fn(self):
+        """Return the training data shuffled, repeated and batched."""
+        ds = tf.data.Dataset.from_tensor_slices(({"x": self.train_data}, self.train_labels))
+        return ds.shuffle(1000).repeat().batch(self.train_batch_size)
+
+    def eval_input_fn(self):
+        """Return the evaluation data in batches of one, repeated."""
+        ds = tf.data.Dataset.from_tensor_slices(({"x": self.eval_data}, self.eval_labels))
+        return ds.batch(1).repeat()
+
+
+def get_available_gpus():
+    """Count the local devices whose device_type is "GPU"."""
+    return sum(1 for dev in device_lib.list_local_devices() if dev.device_type == "GPU")
+
+
+def helper_mirrored(
+    trial_dir,  # passed to the SessionHook as out_dir
+    save_all=False,
+    num_steps=3,  # number of steps run in every phase listed in `steps`
+    save_config=None,  # defaults to SaveConfig(save_interval=2)
+    reduction_config=None,
+    include_collections=None,  # defaults to weights, biases, gradients and losses
+    steps=None,  # phases run in order: "train", "eval", "predict"; defaults to ["train"]
+    zcc=False,  # when True no SessionHook is created or passed to the estimator
+    eval_distributed=False,  # when True the strategy is also used as eval_distribute
+    include_workers="all",
+):
+    num_gpus = get_available_gpus()
+    num_devices = num_gpus if num_gpus > 0 else 1
+    batch_size = 10 * num_devices
+
+    # input_fn which serves Dataset
+    input_fn_provider = InputFnProvider(per_device_batch_size(batch_size, num_devices))
+
+    # Use multiple GPUs via MirroredStrategy.
+    # All available GPUs will be used if `num_gpus` is omitted.
+    # if num_devices > 1:
+    distribution = tf.contrib.distribute.MirroredStrategy()
+    # print("### Doing Multi GPU Training")
+    # else:
+    #     distribution = None
+    # Pass to RunConfig
+    config = tf.estimator.RunConfig(
+        train_distribute=distribution,
+        eval_distribute=distribution if eval_distributed else None,
+        model_dir="/tmp/mnist_convnet_model",  # NOTE(review): fixed path shared by every call
+    )
+
+    if save_config is None:
+        save_config = smd.SaveConfig(save_interval=2)
+
+    if include_collections is None:
+        include_collections = [
+            CollectionKeys.WEIGHTS,
+            CollectionKeys.BIASES,
+            CollectionKeys.GRADIENTS,
+            CollectionKeys.LOSSES,
+        ]
+
+    if not zcc:
+        ts_hook = smd.SessionHook(
+            out_dir=trial_dir,
+            save_all=save_all,
+            include_collections=include_collections,
+            save_config=save_config,
+            reduction_config=reduction_config,
+            include_workers=include_workers,
+        )
+    else:
+        print("zcc is passed. ignoring include_collections and save_config")
+
+    mnist_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, config=config)
+    if steps is None:
+        steps = ["train"]
+
+    for s in steps:
+        if s == "train":
+            print("Starting train")
+            if not zcc:
+                ts_hook.set_mode(smd.modes.TRAIN)
+                # Train the model
+                mnist_classifier.train(
+                    input_fn=input_fn_provider.train_input_fn, steps=num_steps, hooks=[ts_hook]
+                )
+            else:
+                mnist_classifier.train(input_fn=input_fn_provider.train_input_fn, steps=num_steps)
+        elif s == "eval":
+            print("Starting eval")
+
+            if not zcc:
+                ts_hook.set_mode(smd.modes.EVAL)
+                # Evaluate the model
+                mnist_classifier.evaluate(
+                    input_fn=input_fn_provider.eval_input_fn, steps=num_steps, hooks=[ts_hook]
+                )
+            else:
+                mnist_classifier.evaluate(input_fn=input_fn_provider.eval_input_fn, steps=num_steps)
+        elif s == "predict":
+            print("Starting predict")
+            if not zcc:
+                ts_hook.set_mode(smd.modes.PREDICT)
+                # Build the prediction iterator; it is consumed below
+                p = mnist_classifier.predict(
+                    input_fn=input_fn_provider.eval_input_fn, hooks=[ts_hook]
+                )
+            else:
+                p = mnist_classifier.predict(input_fn=input_fn_provider.eval_input_fn)
+            for i in range(num_steps):
+                next(p)  # pull one prediction per step
+    get_hook()._cleanup()  # NOTE(review): presumably resets hook state between runs - confirm
+    return distribution
+
+
+def skip_trial_check():
+    """Tell the caller whether the trial assertions should be skipped.
+
+    On TensorFlow older than 1.14.0 SMDebug is disabled for mirrored
+    strategy, so no trial will be loaded and there is nothing to check.
+    """
+    import tensorflow as tf
+    from packaging import version
+
+    return version.parse(tf.__version__) < version.parse("1.14.0")
+
+
+@pytest.mark.slow
+def test_basic(out_dir, zcc=False):
+    """Run train, eval, predict and train again under MirroredStrategy, then check the trial."""
+    strategy = helper_mirrored(
+        out_dir,
+        steps=["train", "eval", "predict", "train"],
+        include_collections=[
+            CollectionKeys.WEIGHTS,
+            CollectionKeys.BIASES,
+            CollectionKeys.GRADIENTS,
+            CollectionKeys.LOSSES,
+        ],
+        eval_distributed=False,
+        zcc=zcc,
+    )
+    if skip_trial_check():
+        return
+
+    tr = create_trial_fast_refresh(out_dir)
+    # expected tensors: weights and biases (8), gradients (8), losses (1 per replica + 1)
+    print(tr.tensors())
+    assert len(tr.tensors()) == 8 + 8 + (1 * strategy.num_replicas_in_sync) + 1
+    assert len(tr.steps()) == 7
+    assert len(tr.steps(ModeKeys.TRAIN)) == 3
+    assert len(tr.steps(ModeKeys.EVAL)) == 2
+    assert len(tr.steps(ModeKeys.PREDICT)) == 2
+
+    assert "dense_1/kernel:0" in tr.tensors(collection="weights")
+    for tname in tr.tensors(collection="weights"):
+        for s in tr.tensor(tname).steps(ModeKeys.TRAIN):
+            assert len(tr.tensor(tname).workers(s, ModeKeys.TRAIN)) == strategy.num_replicas_in_sync
+            for worker in tr.tensor(tname).workers(s, ModeKeys.TRAIN):
+                assert tr.tensor(tname).value(s, worker=worker, mode=ModeKeys.TRAIN) is not None
+        for s in tr.tensor(tname).steps(ModeKeys.EVAL):
+            assert len(tr.tensor(tname).workers(s, ModeKeys.EVAL)) == strategy.num_replicas_in_sync
+            assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None
+
+    # Raw string: "\d" is meant for the regex engine and is not a valid string escape.
+    tensornames = tr.tensors(regex=r"Identity_\d+:0")
+    for s in tr.tensor(tensornames[0]).steps(ModeKeys.TRAIN):
+        workers = tr.tensor(tensornames[0]).workers(s, ModeKeys.TRAIN)
+        for w in workers:
+            assert tr.tensor(tensornames[0]).value(s, worker=w, mode=ModeKeys.TRAIN) is not None
+        assert len(workers) == strategy.num_replicas_in_sync
+
+    for tname in tr.tensors(collection="losses"):
+        if tname != tensornames[0]:
+            for s in tr.tensor(tname).steps(ModeKeys.TRAIN):
+                assert len(tr.tensor(tname).workers(s, ModeKeys.TRAIN)) == 1
+                assert tr.tensor(tname).value(s, mode=ModeKeys.TRAIN) is not None
+
+    tname = "sparse_softmax_cross_entropy_loss/value:0"
+    for s in tr.tensor(tname).steps(ModeKeys.EVAL):
+        assert len(tr.tensor(tname).workers(s, ModeKeys.EVAL)) == strategy.num_replicas_in_sync
+        assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None
+
+
+@pytest.mark.slow
+def test_eval_distributed(out_dir):
+    """Train and evaluate with the strategy also used for eval, then check the trial."""
+    strategy = helper_mirrored(
+        out_dir,
+        steps=["train", "eval"],
+        include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.BIASES, CollectionKeys.LOSSES],
+        eval_distributed=True,
+    )
+    if skip_trial_check():
+        return
+    tr = create_trial_fast_refresh(out_dir)
+    assert len(tr.tensors()) == 8 + 1 * strategy.num_replicas_in_sync + 1
+    assert len(tr.steps()) == 4
+    assert len(tr.steps(ModeKeys.TRAIN)) == 2
+    assert len(tr.steps(ModeKeys.EVAL)) == 2
+
+    for tname in tr.tensors(collection="weights"):
+        for s in tr.tensor(tname).steps(ModeKeys.TRAIN):
+            assert len(tr.tensor(tname).workers(s, ModeKeys.TRAIN)) == strategy.num_replicas_in_sync
+            for worker in tr.tensor(tname).workers(s, ModeKeys.TRAIN):
+                assert tr.tensor(tname).value(s, worker=worker, mode=ModeKeys.TRAIN) is not None
+        for s in tr.tensor(tname).steps(ModeKeys.EVAL):
+            assert len(tr.tensor(tname).workers(s, ModeKeys.EVAL)) == strategy.num_replicas_in_sync
+            assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None
+
+    # Raw string: "\d" is meant for the regex engine and is not a valid string escape.
+    tensornames = tr.tensors(regex=r"Identity_\d+:0")
+    for s in tr.tensor(tensornames[0]).steps(ModeKeys.TRAIN):
+        workers = tr.tensor(tensornames[0]).workers(s, ModeKeys.TRAIN)
+        for w in workers:
+            assert tr.tensor(tensornames[0]).value(s, worker=w, mode=ModeKeys.TRAIN) is not None
+        assert len(workers) == strategy.num_replicas_in_sync
+
+    for tname in tr.tensors(collection="losses"):
+        for s in tr.tensor(tname).steps(ModeKeys.EVAL):
+            assert len(tr.tensor(tname).workers(s, ModeKeys.EVAL)) == 1
+            assert tr.tensor(tname).value(s, mode=ModeKeys.EVAL) is not None
+        if tname != tensornames[0]:
+            for s in tr.tensor(tname).steps(ModeKeys.TRAIN):  # TRAIN steps are read in TRAIN mode
+                assert len(tr.tensor(tname).workers(s, ModeKeys.TRAIN)) == 1
+                assert tr.tensor(tname).value(s, mode=ModeKeys.TRAIN) is not None
+
+
+@pytest.mark.slow
+def test_reductions(out_dir):
+    """Check what can be read back when a ReductionConfig is used.
+
+    Reading a full weight value must fail while its reductions are available;
+    losses must have full values and no reductions.
+    """
+    reduction_config = smd.ReductionConfig(
+        reductions=["sum", "max"], abs_reductions=["sum", "max"], norms=["l1"]
+    )
+    strategy = helper_mirrored(
+        out_dir,
+        steps=["train", "eval"],
+        reduction_config=reduction_config,
+        include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.BIASES, CollectionKeys.LOSSES],
+        eval_distributed=True,
+    )
+    if skip_trial_check():
+        return
+
+    trial = create_trial_fast_refresh(out_dir)
+    assert len(trial.tensors()) == 8 + 1 * strategy.num_replicas_in_sync + 1
+    assert len(trial.steps()) == 4
+    assert len(trial.steps(ModeKeys.TRAIN)) == 2
+    assert len(trial.steps(ModeKeys.EVAL)) == 2
+
+    # Weights: the full value must be unavailable, the reductions must be there.
+    for name in trial.tensors(collection="weights"):
+        for mode in (ModeKeys.TRAIN, ModeKeys.EVAL):
+            for step in trial.tensor(name).steps(mode):
+                try:
+                    trial.tensor(name).value(step, mode=mode)
+                except TensorUnavailableForStep:
+                    # for some tensors l1 reduction can't be saved due to improper
+                    # dimensions for the reduction, so only 4 of the 5 are guaranteed
+                    saved = trial.tensor(name).reduction_values(step, mode=mode)
+                    assert len(saved) >= 4
+                else:
+                    assert False
+
+    # Losses: full values, no reductions, first for EVAL and then for TRAIN.
+    for mode in (ModeKeys.EVAL, ModeKeys.TRAIN):
+        for name in trial.tensors(collection="losses"):
+            for step in trial.tensor(name).steps(mode):
+                assert len(trial.tensor(name).reduction_values(step, mode=mode)) == 0
+                assert trial.tensor(name).value(step, mode=mode) is not None
+
+
+@pytest.mark.slow
+def test_save_all(out_dir):
+    """With save_all=True a single train step must save more than 100 tensors."""
+    helper_mirrored(out_dir, steps=["train"], num_steps=1, save_all=True, eval_distributed=True)
+    if skip_trial_check():
+        return
+
+    trial = create_trial_fast_refresh(out_dir)
+    assert len(trial.tensors()) > 100
+    assert len(trial.steps())
+    # each of these collections has to contain at least one tensor
+    for collection in ("weights", "biases", "gradients"):
+        assert len(trial.tensors(collection=collection))
+
+
+@pytest.mark.slow
+def test_save_all_worker(out_dir):
+    """With include_workers="all" the trial must list one worker per available GPU."""
+    num_gpus = get_available_gpus()
+    if num_gpus == 0:
+        # skip test if no gpus available
+        return
+    strategy = helper_mirrored(
+        out_dir,
+        steps=["train"],
+        num_steps=1,
+        save_all=True,
+        eval_distributed=True,
+        include_workers="all",
+    )
+    trial = create_trial_fast_refresh(out_dir)
+    assert len(trial.steps())
+    assert len(trial.workers()) == num_gpus
+    for collection, name in (("weights", "conv2d/kernel:0"), ("biases", "conv2d/bias:0")):
+        assert len(trial.tensors(collection=collection))
+        assert name in trial.tensors(collection=collection)
+        assert len(trial.tensor(name).workers(0)) == strategy.num_replicas_in_sync
+    assert len(trial.tensors(collection="gradients"))
+
+
+@pytest.mark.slow
+def test_save_one_worker(out_dir):
+    """With include_workers="one" the trial must contain exactly one worker."""
+    helper_mirrored(
+        out_dir,
+        steps=["train"],
+        num_steps=1,
+        save_all=True,
+        eval_distributed=True,
+        include_workers="one",
+    )
+    trial = create_trial_fast_refresh(out_dir)
+    assert len(trial.workers()) == 1
+    assert len(trial.steps())
+    for collection in ("weights", "biases", "gradients"):
+        assert len(trial.tensors(collection=collection))
diff --git a/tests/zero_code_change/tests/tensorflow/keras/test_keras_mirrored.py b/tests/zero_code_change/tests/tensorflow/keras/test_keras_mirrored.py
new file mode 100644
index 000000000..71b5e6923
--- /dev/null
+++ b/tests/zero_code_change/tests/tensorflow/keras/test_keras_mirrored.py
@@ -0,0 +1,560 @@
+# Future
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+# Standard Library
+import os
+
+# Third Party
+import pytest
+import tensorflow as tf
+import tensorflow_datasets as tfds
+from tensorflow.python.client import device_lib
+from tests.tensorflow.utils import create_trial_fast_refresh
+
+# First Party
+import smdebug.tensorflow as smd
+from smdebug.core.access_layer import has_training_ended
+from smdebug.core.collection import CollectionKeys
+from smdebug.core.modes import ModeKeys
+from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS
+from smdebug.exceptions import TensorUnavailable, TensorUnavailableForStep
+from smdebug.tensorflow import ReductionConfig, SaveConfig
+from smdebug.tensorflow.keras import KerasHook
+
+tfds.disable_progress_bar()
+
+
+class FetchTensorCallback(tf.keras.callbacks.Callback):
+    """Fetch `tensors` during each training batch and assert each fetched value is not None."""
+    def __init__(self, tensors):
+        super(FetchTensorCallback, self).__init__()  # let the Keras base class initialise itself
+        self.tensors = tensors
+        self.fetches_added = False
+
+    def _callback_fn(self, tensor_val):
+        assert tensor_val is not None
+
+    def on_train_batch_begin(self, batch, logs=None):
+        try:
+            from tensorflow.python.keras.distribute.distributed_training_utils import (
+                get_distributed_model,
+            )
+            from tensorflow.python.keras.utils.mode_keys import ModeKeys as KerasModeKeys
+            for t in self.tensors:
+                x = get_distributed_model(self.model, KerasModeKeys.TRAIN)._distributed_function
+                x.fetches.append(t)
+                x.fetch_callbacks[t] = self._callback_fn
+            self.fetches_added = True
+        except ImportError:
+            pass  # deliberate best effort: without these TF internals nothing is fetched
+
+    def on_train_batch_end(self, batch, logs=None):
+        if self.fetches_added:
+            # fetches_added is only set after the same imports succeeded in on_train_batch_begin
+            from tensorflow.python.keras.distribute.distributed_training_utils import (
+                get_distributed_model,
+            )
+            from tensorflow.python.keras.utils.mode_keys import ModeKeys as KerasModeKeys
+            for t in self.tensors:
+                x = get_distributed_model(self.model, KerasModeKeys.TRAIN)._distributed_function
+                x.fetches.remove(t)
+                del x.fetch_callbacks[t]
+            self.fetches_added = False
+
+
+def get_available_gpus():
+    """Count the local devices whose device_type is "GPU"."""
+    return sum(1 for dev in device_lib.list_local_devices() if dev.device_type == "GPU")
+
+
+def train_model(
+    trial_dir,  # passed to the KerasHook as out_dir
+    save_all=False,
+    hook=None,  # pre-built hook; a KerasHook is created when omitted and zcc is False
+    include_collections=None,
+    reduction_config=None,
+    save_config=None,  # defaults to SaveConfig(save_interval=3) when the hook is created here
+    use_keras_optimizer=True,  # False selects tf.train.AdamOptimizer(0.1) instead
+    eager=False,  # forwarded to model.compile(run_eagerly=...)
+    create_relu_collection=False,  # add the Dense(64) relu layer to the "relu" collection
+    strategy=None,  # defaults to tf.distribute.MirroredStrategy()
+    steps=None,  # phases run in order: "train", "eval", "predict"; defaults to ["train"]
+    add_callbacks=None,  # may contain "tensorboard" and/or "fetch_tensor"
+    zcc=False,  # when True no hook is created, wrapped around the optimizer or passed to Keras
+    include_workers="all",
+):
+    print(tf.__version__)
+    tf.keras.backend.clear_session()
+
+    datasets, info = tfds.load(name="mnist", with_info=True, as_supervised=True)
+
+    mnist_train, mnist_test = datasets["train"], datasets["test"]
+
+    if strategy is None:
+        strategy = tf.distribute.MirroredStrategy()
+
+    # You can also do info.splits.total_num_examples to get the total
+    # number of examples in the dataset.
+
+    BUFFER_SIZE = 10000
+
+    BATCH_SIZE_PER_REPLICA = 64
+    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
+
+    def scale(image, label):
+        image = tf.cast(image, tf.float32)
+        image /= 255
+
+        return image, label
+
+    train_dataset = mnist_train.map(scale).cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
+    eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE)
+
+    if hook is None and not zcc:
+        if save_config is None:
+            save_config = SaveConfig(save_interval=3)
+
+        hook = KerasHook(
+            out_dir=trial_dir,
+            save_config=save_config,
+            reduction_config=reduction_config,
+            include_collections=include_collections,
+            save_all=save_all,
+            include_workers=include_workers,
+        )
+
+        if not save_all and include_collections is not None:  # others get end_step=0
+            for cname in hook.include_collections:
+                if cname not in include_collections:
+                    hook.get_collection(cname).save_config = SaveConfig(end_step=0)
+
+    if use_keras_optimizer:
+        opt = tf.keras.optimizers.Adam()
+    else:
+        opt = tf.train.AdamOptimizer(0.1)
+
+    if not zcc:
+        opt = hook.wrap_optimizer(opt)
+
+    with strategy.scope():
+        relu_layer = tf.keras.layers.Dense(64, activation="relu")
+        model = tf.keras.Sequential(
+            [
+                tf.keras.layers.Conv2D(32, 3, activation="relu", input_shape=(28, 28, 1)),
+                tf.keras.layers.MaxPooling2D(),
+                tf.keras.layers.Flatten(),
+                relu_layer,
+                tf.keras.layers.Dense(10, activation="softmax"),
+            ]
+        )
+        model.compile(
+            loss="sparse_categorical_crossentropy",
+            optimizer=opt,
+            run_eagerly=eager,
+            metrics=["accuracy"],
+        )
+
+    if create_relu_collection:  # NOTE(review): hook is None here if zcc=True and none was given
+        hook.get_collection("relu").add_keras_layer(relu_layer, inputs=True, outputs=True)
+
+    hooks = []
+    if add_callbacks:
+        if "tensorboard" in add_callbacks:
+            hooks.append(
+                # write_grads = True causes crash saying handle must be created in scope
+                # error like this https://stackoverflow.com/questions/56836895/custom-training-loop-using-tensorflow-gpu-1-14-and-tf-distribute-mirroredstrateg
+                # this crash happens even if the tornasole callback is off
+                tf.keras.callbacks.TensorBoard(
+                    log_dir="./logs", histogram_freq=4, write_images=True
+                )
+            )
+        if "fetch_tensor" in add_callbacks:
+            hooks.append(FetchTensorCallback(model.weights))
+    if not zcc:
+        hooks.append(hook)
+
+    if steps is None:
+        steps = ["train"]
+    for step in steps:
+        if step == "train":
+            model.fit(train_dataset, epochs=1, steps_per_epoch=10, callbacks=hooks, verbose=0)
+        elif step == "eval":
+            model.evaluate(eval_dataset, steps=10, callbacks=hooks, verbose=0)
+        elif step == "predict":
+            model.predict(train_dataset, steps=4, callbacks=hooks, verbose=0)
+
+    smd.get_hook()._cleanup()  # NOTE(review): presumably resets hook state between runs - confirm
+    return strategy
+
+
+@pytest.mark.skip(
+    "needs to be run individually as it complains that eager needs to be set at "
+    "startup, but pytest does not allow controlling order of tests"
+)
+def test_tf_keras_eager(out_dir):
+    """Run the train phase with eager execution enabled and run_eagerly=True."""
+    tf.enable_eager_execution()
+    train_model(out_dir, eager=True, steps=["train"])
+    tf.disable_eager_execution()
+
+
+@pytest.mark.skip(
+    "needs to be run individually as it complains that eager needs to be set at "
+    "startup, but pytest does not allow controlling order of tests"
+)
+def test_tf_keras_eager_env(out_dir):
+    """Run the train phase with eager execution enabled but run_eagerly=False."""
+    tf.enable_eager_execution()
+    train_model(out_dir, eager=False, steps=["train"])
+    tf.disable_eager_execution()
+
+
+def exhaustive_check(trial_dir, zcc=False, include_workers="one"):
+    include_collections = [
+        CollectionKeys.WEIGHTS,
+        CollectionKeys.BIASES,
+        CollectionKeys.GRADIENTS,
+        CollectionKeys.LOSSES,
+        CollectionKeys.OUTPUTS,
+        CollectionKeys.METRICS,
+        CollectionKeys.OPTIMIZER_VARIABLES,
+    ]
+    strategy = train_model(
+        trial_dir,
+        include_collections=include_collections,
+        steps=["train", "eval", "predict", "train"],
+        include_workers=include_workers,
+        zcc=zcc,
+    )
+
+    tr = create_trial_fast_refresh(trial_dir)
+    print(tr.tensors())
+
+    if include_workers == "all":
+        assert len(tr.workers()) == strategy.num_replicas_in_sync
+        assert len(tr.tensors()) == (6 + 6 + 1 + 3 + strategy.num_replicas_in_sync * 3 + 5)
+    else:
+        assert len(tr.workers()) == 1
+        assert len(tr.tensors()) == (6 + 6 + 1 + 3 + 1 * 3 + 5)
+
+    # 6 weights, 6 gradients, 1 loss, 3 metrics, 24 outputs (8 for each mode), 5 optimizer variables
+    assert len(tr.modes()) == 3
+    assert len(tr.steps()) == 14
+    assert len(tr.steps(ModeKeys.TRAIN)) == 8  # 0, 3, 6, 9, 12, 15, 18, 19(end of epoch)
+    assert len(tr.steps(ModeKeys.EVAL)) == 4
+    assert len(tr.steps(ModeKeys.PREDICT)) == 2  # ran 4 steps above
+
+    assert len(tr.tensors(collection=CollectionKeys.BIASES)) == 3
+    wtnames = tr.tensors(collection=CollectionKeys.WEIGHTS)
+    assert len(wtnames) == 3
+
+    for wtname in wtnames:
+        assert len(tr.tensor(wtname).steps()) == 13, wtname
+        assert len(tr.tensor(wtname).steps(ModeKeys.TRAIN)) == 7
+        for s in tr.tensor(wtname).steps(ModeKeys.TRAIN):
+            assert tr.tensor(wtname).value(s, mode=ModeKeys.TRAIN) is not None
+            for worker in tr.workers():
+                assert tr.tensor(wtname).value(s, mode=ModeKeys.TRAIN, worker=worker) is not None
+        assert len(tr.tensor(wtname).steps(ModeKeys.EVAL)) == 4
+        for s in tr.tensor(wtname).steps(ModeKeys.EVAL):
+            assert tr.tensor(wtname).value(s, mode=ModeKeys.EVAL) is not None
+            for worker in tr.workers():
+                assert tr.tensor(wtname).value(s, mode=ModeKeys.EVAL, worker=worker) is not None
+        assert len(tr.tensor(wtname).steps(ModeKeys.PREDICT)) == 2
+
+    gradnames = tr.tensors(collection=CollectionKeys.GRADIENTS)
+    assert len(gradnames) == 6
+    for gradname in gradnames:
+        assert len(tr.tensor(gradname).steps(ModeKeys.TRAIN)) == 7
+        for s in tr.tensor(gradname).steps(ModeKeys.TRAIN):
+            assert tr.tensor(gradname).value(s, mode=ModeKeys.TRAIN) is not None
+        assert len(tr.tensor(gradname).steps(ModeKeys.EVAL)) == 0
+        assert len(tr.tensor(gradname).steps(ModeKeys.PREDICT)) == 0
+
+    optvarnames = tr.tensors(collection=CollectionKeys.OPTIMIZER_VARIABLES)
+    assert len(optvarnames) == 5
+    for optvarname in optvarnames:
+        assert len(tr.tensor(optvarname).steps(ModeKeys.TRAIN)) == 7
+        for s in tr.tensor(optvarname).steps(ModeKeys.TRAIN):
+            assert tr.tensor(optvarname).value(s, mode=ModeKeys.TRAIN) is not None
+        assert len(tr.tensor(optvarname).steps(ModeKeys.EVAL)) == 0
+        assert len(tr.tensor(optvarname).steps(ModeKeys.PREDICT)) == 0
+
+    assert len(tr.tensors(collection=CollectionKeys.LOSSES)) == 1
+    loss_name = tr.tensors(collection=CollectionKeys.LOSSES)[0]
+    # loss is not in predict mode (so less 2)
+    # add one for end of epoch
+    assert len(tr.tensor(loss_name).steps(ModeKeys.TRAIN)) == 8
+    assert len(tr.tensor(loss_name).steps(ModeKeys.EVAL)) == 4
+    assert len(tr.tensor(loss_name).steps(ModeKeys.PREDICT)) == 0
+    assert len(tr.tensor(loss_name).steps()) == 12
+
+    metricnames = tr.tensors(collection=CollectionKeys.METRICS)
+    assert len(metricnames) == 3
+
+
+@pytest.mark.slow
+def test_tf_keras(out_dir, zcc=False, include_workers="all"):
+    exhaustive_check(out_dir, zcc=zcc, include_workers=include_workers)
+
+
+@pytest.mark.slow
+def test_tf_keras_non_keras_opt(out_dir):
+    include_collections = [
+        CollectionKeys.GRADIENTS,
+        CollectionKeys.OPTIMIZER_VARIABLES,
+        CollectionKeys.METRICS,
+    ]
+    train_model(
+        out_dir,
+        include_collections=include_collections,
+        use_keras_optimizer=False,
+        steps=["train", "eval"],
+    )
+    tr = create_trial_fast_refresh(out_dir)
+    assert len(tr.modes()) == 2
+    assert len(tr.steps(ModeKeys.TRAIN)) == 4  # 0, 3, 6, 9
+    assert len(tr.tensors(collection=CollectionKeys.GRADIENTS)) == 6
+    gradient_name = tr.tensors(collection=CollectionKeys.GRADIENTS)[0]
+    assert len(tr.tensor(gradient_name).steps(ModeKeys.TRAIN)) == 4
+    assert len(tr.tensor(gradient_name).steps(ModeKeys.EVAL)) == 0
+
+    # not supported for non keras optimizer with keras
+    assert len(tr.tensors(collection=CollectionKeys.OPTIMIZER_VARIABLES)) == 0
+
+
+@pytest.mark.slow
+def test_save_all(out_dir):
+    strategy = train_model(
+        out_dir,
+        include_collections=None,
+        save_all=True,
+        save_config=SaveConfig(save_steps=[5]),
+        steps=["train"],
+    )
+    tr = create_trial_fast_refresh(out_dir)
+    print(tr.tensors())
+    assert (
+        len(tr.tensors())
+        == 6 + 6 + 5 + 3 + 1 + 3 * strategy.num_replicas_in_sync + 2 * strategy.num_replicas_in_sync
+    )
+    # weights, grads, optimizer_variables, metrics, losses, outputs
+    assert len(tr.steps()) == 3
+
+
+@pytest.mark.slow
+def test_save_one_worker(out_dir):
+    """Train with include_workers="one" and check each collection has a single worker."""
+    train_model(
+        out_dir,
+        include_collections=None,
+        save_all=True,
+        save_config=SaveConfig(save_steps=[5]),
+        steps=["train"],
+        include_workers="one",
+    )
+    tr = create_trial_fast_refresh(out_dir)
+    assert len(tr.workers()) == 1
+    assert len(tr.steps())
+    assert len(tr.tensors(collection="weights"))
+    assert len(tr.tensor(tr.tensors(collection="weights")[0]).workers(0)) == 1
+    assert len(tr.tensors(collection="biases"))
+    assert len(tr.tensor(tr.tensors(collection="biases")[0]).workers(0)) == 1
+    assert len(tr.tensors(collection="gradients"))
+
+
+@pytest.mark.slow
+def test_save_all_workers(out_dir, zcc=False):
+    """Train with include_workers="all" and check one worker per replica; needs a GPU."""
+    if get_available_gpus() == 0:
+        pytest.skip("no GPUs available")
+    strategy = train_model(
+        out_dir,
+        include_collections=None,
+        save_all=True,
+        save_config=SaveConfig(save_steps=[5]),
+        steps=["train"],
+        include_workers="all",
+    )
+    tr = create_trial_fast_refresh(out_dir)
+    assert len(tr.workers()) == get_available_gpus()
+    assert len(tr.tensors(collection="weights"))
+    assert (
+        len(tr.tensor(tr.tensors(collection="weights")[0]).workers(0))
+        == strategy.num_replicas_in_sync
+    )
+
+    assert "conv2d/weights/conv2d/kernel:0" in tr.tensors(collection="weights")
+    assert (
+        len(tr.tensor("conv2d/weights/conv2d/kernel:0").workers(0)) == strategy.num_replicas_in_sync
+    )
+
+    assert len(tr.tensors(collection="biases"))
+    assert "conv2d/weights/conv2d/bias:0" in tr.tensors(collection="biases")
+    assert (
+        len(tr.tensor(tr.tensors(collection="biases")[0]).workers(0))
+        == strategy.num_replicas_in_sync
+    )
+    assert len(tr.tensors(collection="gradients"))
+
+
+@pytest.mark.slow
+def test_base_reductions(out_dir):
+    train_model(
+        out_dir,
+        include_collections=[
+            CollectionKeys.WEIGHTS,
+            CollectionKeys.BIASES,
+            CollectionKeys.METRICS,
+            CollectionKeys.LOSSES,
+        ],
+        reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS),
+        steps=["train"],
+    )
+
+    tr = create_trial_fast_refresh(out_dir)
+    weight_name = tr.tensors(collection=CollectionKeys.WEIGHTS)[0]
+
+    try:
+        tr.tensor(weight_name).value(0)
+        assert False
+    except TensorUnavailableForStep:
+        assert tr.tensor(weight_name).reduction_values(0)
+
+    loss_name = tr.tensors(collection=CollectionKeys.LOSSES)[0]
+    assert tr.tensor(loss_name).value(0) is not None
+
+    metric_name = tr.tensors(collection=CollectionKeys.METRICS)[0]
+    assert tr.tensor(metric_name).value(0) is not None
+
+
+@pytest.mark.slow
+def test_collection_reductions(out_dir):
+    tf.reset_default_graph()
+    tf.keras.backend.clear_session()
+    hook = KerasHook(
+        out_dir=out_dir,
+        save_config=SaveConfig(save_interval=3),
+        include_collections=[
+            CollectionKeys.WEIGHTS,
+            CollectionKeys.BIASES,
+            CollectionKeys.GRADIENTS,
+        ],
+    )
+    hook.get_collection(CollectionKeys.GRADIENTS).reduction_config = ReductionConfig(norms=["l1"])
+    train_model(out_dir, hook=hook, steps=["train"])
+
+    tr = create_trial_fast_refresh(out_dir)
+    weight_name = tr.tensors(collection=CollectionKeys.WEIGHTS)[0]
+    grad_name = tr.tensors(collection=CollectionKeys.GRADIENTS)[0]
+
+    try:
+        tr.tensor(weight_name).value(0)
+        tr.tensor(grad_name).value(0)
+        assert False
+    except TensorUnavailableForStep:
+        try:
+            assert tr.tensor(weight_name).reduction_value(0, "l1") is not None
+        except ValueError:
+            # some tensors reduction can't be computed
+            pass
+    except TensorUnavailable:
+        # sometimes we might not have tensor saved if it was only being
+        # saved as reduction and the reduction computation failed
+        pass
+
+
+@pytest.mark.slow
+def test_training_end(out_dir):
+    train_model(out_dir, include_collections=[CollectionKeys.OUTPUTS], steps=["train"])
+    assert has_training_ended(out_dir) is True
+
+
+@pytest.mark.slow
+def test_collection_add(out_dir):
+    strategy = train_model(
+        out_dir,
+        include_collections=["relu"],
+        save_config=SaveConfig(save_interval=9),
+        create_relu_collection=True,
+        steps=["train"],
+    )
+
+    tr = create_trial_fast_refresh(out_dir)
+    relu_coll_tensor_names = tr.tensors(collection="relu")
+
+    assert len(relu_coll_tensor_names) == strategy.num_replicas_in_sync * 2
+    assert tr.tensor(relu_coll_tensor_names[0]).value(0) is not None
+    assert tr.tensor(relu_coll_tensor_names[1]).value(0) is not None
+
+
+@pytest.mark.slow
+def test_include_regex(out_dir):
+    hook = KerasHook(
+        out_dir=out_dir,
+        save_config=SaveConfig(save_interval=9),
+        include_collections=["custom_coll"],
+        include_workers="all",
+    )
+    hook.get_collection("custom_coll").include("dense")
+    strategy = train_model(out_dir, hook=hook, steps=["train"])
+
+    tr = create_trial_fast_refresh(out_dir)
+    tnames = tr.tensors(collection="custom_coll")
+
+    assert len(tnames) == 4 + 3 * strategy.num_replicas_in_sync
+    for tname in tnames:
+        assert tr.tensor(tname).value(0) is not None
+
+
+@pytest.mark.slow
+def test_clash_with_tb_callback(out_dir):
+    train_model(
+        out_dir,
+        save_config=SaveConfig(save_interval=9),
+        include_collections=[
+            CollectionKeys.WEIGHTS,
+            CollectionKeys.BIASES,
+            CollectionKeys.GRADIENTS,
+            CollectionKeys.LOSSES,
+            CollectionKeys.METRICS,
+        ],
+        steps=["train"],
+        add_callbacks=["tensorboard"],
+    )
+    tr = create_trial_fast_refresh(out_dir)
+    assert len(tr.tensors()) == 16
+
+
+@pytest.mark.slow
+def test_clash_with_custom_callback(out_dir):
+    strategy = train_model(
+        out_dir,
+        include_collections=[
+            CollectionKeys.WEIGHTS,
+            CollectionKeys.BIASES,
+            CollectionKeys.OUTPUTS,
+            CollectionKeys.GRADIENTS,
+        ],
+        save_config=SaveConfig(save_interval=9),
+        steps=["train"],
+        add_callbacks=["fetch_tensor"],
+    )
+    tr = create_trial_fast_refresh(out_dir)
+    assert len(tr.tensors()) == 6 + 6 + strategy.num_replicas_in_sync * 1 + 3
+
+
+def test_one_device(out_dir):
+    strategy = train_model(
+        out_dir,
+        include_collections=[
+            CollectionKeys.WEIGHTS,
+            CollectionKeys.BIASES,
+            CollectionKeys.OUTPUTS,
+            CollectionKeys.GRADIENTS,
+        ],
+        save_config=SaveConfig(save_interval=9),
+        strategy=tf.distribute.OneDeviceStrategy(device="/cpu:0"),
+        steps=["train"],
+    )
+    assert os.path.isdir(os.path.join(out_dir, "events")) is False
diff --git a/tests/zero_code_change/tests/tensorflow/utils.py b/tests/zero_code_change/tests/tensorflow/utils.py
new file mode 100644
index 000000000..e8e3d8af6
--- /dev/null
+++ b/tests/zero_code_change/tests/tensorflow/utils.py
@@ -0,0 +1,8 @@
+# First Party
+from smdebug.trials import create_trial
+
+
+def create_trial_fast_refresh(path, **kwargs):
+    tr = create_trial(path, **kwargs)
+    tr.training_end_delay_refresh = 0.01
+    return tr
diff --git a/tests/zero_code_change/tests_path.py b/tests/zero_code_change/tests_path.py
deleted file mode 100644
index f58ea66ce..000000000
--- a/tests/zero_code_change/tests_path.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# Standard Library
-import os
-import sys
-
-# Hack to import tests.tensorflow
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

From ce13ca4780ad6f831d37f6aa560d9d2d58dc293f Mon Sep 17 00:00:00 2001
From: Rahul Huilgol <huilgolr@amazon.com>
Date: Sat, 23 Nov 2019 16:12:17 -0800
Subject: [PATCH 04/11] Release 0.4.11

---
 smdebug/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smdebug/_version.py b/smdebug/_version.py
index 805e7c470..58ce5cd17 100644
--- a/smdebug/_version.py
+++ b/smdebug/_version.py
@@ -1 +1 @@
-__version__ = "0.4.10"
+__version__ = "0.4.11"

From 04717d24e7085efaee027e88c58e17cf47cc6770 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol <huilgolr@amazon.com>
Date: Tue, 26 Nov 2019 16:15:28 -0800
Subject: [PATCH 05/11] Update version

---
 smdebug/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smdebug/_version.py b/smdebug/_version.py
index 58ce5cd17..9b084a609 100644
--- a/smdebug/_version.py
+++ b/smdebug/_version.py
@@ -1 +1 @@
-__version__ = "0.4.11"
+__version__ = "0.4.12"

From d634b240978bd9a8958ccb3fac458079d3b32814 Mon Sep 17 00:00:00 2001
From: Owen Thomas <31292660+owen-t@users.noreply.github.com>
Date: Fri, 29 Nov 2019 18:14:17 -0800
Subject: [PATCH 06/11] Pass iteration_number to metrics.log_metric as keyword
 argument. Fix bug where it was being passed to the timestamp positional
 argument. (#62)

Overriding CI fail due to urgency and clear fix.

(cherry picked from commit 41fdc8088b70769a592c13931a06e765615485b6)
---
 smdebug/core/hook.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py
index 8782ded1f..f4bd1f59e 100644
--- a/smdebug/core/hook.py
+++ b/smdebug/core/hook.py
@@ -569,7 +569,8 @@ def _write_scalars(self):
             write_tb = scalar_obj.write_tb
             write_event = scalar_obj.write_event
             if self.metrics_writer and sm_metric:
-                self.metrics_writer.log_metric(scalar_name, scalar_val, self.mode_steps[self.mode])
+                self.metrics_writer.log_metric(scalar_name, scalar_val,
+                                               iteration_number=self.mode_steps[self.mode])
             if write_tb:
                 tb_writer = self._maybe_get_tb_writer()
                 if tb_writer:

From 6302a298c050f8b39649a5dca40b2b3e237c8527 Mon Sep 17 00:00:00 2001
From: Denis Davydenko <dden@amazon.com>
Date: Fri, 29 Nov 2019 21:43:57 -0800
Subject: [PATCH 07/11] Update version (0.4.12 -> 0.4.13)

---
 smdebug/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smdebug/_version.py b/smdebug/_version.py
index 9b084a609..4b2ce7df3 100644
--- a/smdebug/_version.py
+++ b/smdebug/_version.py
@@ -1 +1 @@
-__version__ = "0.4.12"
+__version__ = "0.4.13"

From 86f33836fe355cbc527fc916d02f62df897291b4 Mon Sep 17 00:00:00 2001
From: Denis Davydenko <dzianis.davydzenka@gmail.com>
Date: Mon, 2 Dec 2019 12:31:48 -0800
Subject: [PATCH 08/11] Update NOTICE

(cherry picked from commit 647f0005d3df452192d7565da76b7a0185bb0d6d)
---
 NOTICE | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NOTICE b/NOTICE
index 17ce381eb..740e5df1f 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,2 +1,2 @@
-Tornasole_core
+Amazon SageMaker Debugger
 Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.

From a3de06dab8c7039ff4392a5993a2c63dd3680942 Mon Sep 17 00:00:00 2001
From: Denis Davydenko <dzianis.davydzenka@gmail.com>
Date: Mon, 2 Dec 2019 13:52:34 -0800
Subject: [PATCH 09/11] Create THIRD-PARTY

(cherry picked from commit 5bfdbf439f966a86ea5904d527d9e39597fdfa97)
---
 THIRD-PARTY | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 203 insertions(+)
 create mode 100644 THIRD-PARTY

diff --git a/THIRD-PARTY b/THIRD-PARTY
new file mode 100644
index 000000000..1a95df9c9
--- /dev/null
+++ b/THIRD-PARTY
@@ -0,0 +1,203 @@
+** Tensorboard; version 1.13.0 -- https://github.com/tensorflow/tensorboard
+Copyright 2017 The TensorFlow Authors.  All rights reserved.
+
+Apache License
+
+Version 2.0, January 2004
+
+http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND
+DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction, and
+      distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by the
+      copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all other
+      entities that control, are controlled by, or are under common control
+      with that entity. For the purposes of this definition, "control" means
+      (i) the power, direct or indirect, to cause the direction or management
+      of such entity, whether by contract or otherwise, or (ii) ownership of
+      fifty percent (50%) or more of the outstanding shares, or (iii)
+      beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity exercising
+      permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation source,
+      and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but not limited
+      to compiled object code, generated documentation, and conversions to
+      other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or Object
+      form, made available under the License, as indicated by a copyright
+      notice that is included in or attached to the work (an example is
+      provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object form,
+      that is based on (or derived from) the Work and for which the editorial
+      revisions, annotations, elaborations, or other modifications represent,
+      as a whole, an original work of authorship. For the purposes of this
+      License, Derivative Works shall not include works that remain separable
+      from, or merely link (or bind by name) to the interfaces of, the Work and
+      Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including the original
+      version of the Work and any modifications or additions to that Work or
+      Derivative Works thereof, that is intentionally submitted to Licensor for
+      inclusion in the Work by the copyright owner or by an individual or Legal
+      Entity authorized to submit on behalf of the copyright owner. For the
+      purposes of this definition, "submitted" means any form of electronic,
+      verbal, or written communication sent to the Licensor or its
+      representatives, including but not limited to communication on electronic
+      mailing lists, source code control systems, and issue tracking systems
+      that are managed by, or on behalf of, the Licensor for the purpose of
+      discussing and improving the Work, but excluding communication that is
+      conspicuously marked or otherwise designated in writing by the copyright
+      owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity on
+      behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of this
+   License, each Contributor hereby grants to You a perpetual, worldwide,
+   non-exclusive, no-charge, royalty-free, irrevocable copyright license to
+   reproduce, prepare Derivative Works of, publicly display, publicly perform,
+   sublicense, and distribute the Work and such Derivative Works in Source or
+   Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of this
+   License, each Contributor hereby grants to You a perpetual, worldwide,
+   non-exclusive, no-charge, royalty-free, irrevocable (except as stated in
+   this section) patent license to make, have made, use, offer to sell, sell,
+   import, and otherwise transfer the Work, where such license applies only to
+   those patent claims licensable by such Contributor that are necessarily
+   infringed by their Contribution(s) alone or by combination of their
+   Contribution(s) with the Work to which such Contribution(s) was submitted.
+   If You institute patent litigation against any entity (including a
+   cross-claim or counterclaim in a lawsuit) alleging that the Work or a
+   Contribution incorporated within the Work constitutes direct or contributory
+   patent infringement, then any patent licenses granted to You under this
+   License for that Work shall terminate as of the date such litigation is
+   filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the Work or
+   Derivative Works thereof in any medium, with or without modifications, and
+   in Source or Object form, provided that You meet the following conditions:
+
+      (a) You must give any other recipients of the Work or Derivative Works a
+      copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices stating
+      that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works that You
+      distribute, all copyright, patent, trademark, and attribution notices
+      from the Source form of the Work, excluding those notices that do not
+      pertain to any part of the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+      distribution, then any Derivative Works that You distribute must include
+      a readable copy of the attribution notices contained within such NOTICE
+      file, excluding those notices that do not pertain to any part of the
+      Derivative Works, in at least one of the following places: within a
+      NOTICE text file distributed as part of the Derivative Works; within the
+      Source form or documentation, if provided along with the Derivative
+      Works; or, within a display generated by the Derivative Works, if and
+      wherever such third-party notices normally appear. The contents of the
+      NOTICE file are for informational purposes only and do not modify the
+      License. You may add Your own attribution notices within Derivative Works
+      that You distribute, alongside or as an addendum to the NOTICE text from
+      the Work, provided that such additional attribution notices cannot be
+      construed as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and may
+      provide additional or different license terms and conditions for use,
+      reproduction, or distribution of Your modifications, or for any such
+      Derivative Works as a whole, provided Your use, reproduction, and
+      distribution of the Work otherwise complies with the conditions stated in
+      this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise, any
+   Contribution intentionally submitted for inclusion in the Work by You to the
+   Licensor shall be under the terms and conditions of this License, without
+   any additional terms or conditions. Notwithstanding the above, nothing
+   herein shall supersede or modify the terms of any separate license agreement
+   you may have executed with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor, except
+   as required for reasonable and customary use in describing the origin of the
+   Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or agreed to in
+   writing, Licensor provides the Work (and each Contributor provides its
+   Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied, including, without limitation, any
+   warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or
+   FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining
+   the appropriateness of using or redistributing the Work and assume any risks
+   associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory, whether
+   in tort (including negligence), contract, or otherwise, unless required by
+   applicable law (such as deliberate and grossly negligent acts) or agreed to
+   in writing, shall any Contributor be liable to You for damages, including
+   any direct, indirect, special, incidental, or consequential damages of any
+   character arising as a result of this License or out of the use or inability
+   to use the Work (including but not limited to damages for loss of goodwill,
+   work stoppage, computer failure or malfunction, or any and all other
+   commercial damages or losses), even if such Contributor has been advised of
+   the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing the Work
+   or Derivative Works thereof, You may choose to offer, and charge a fee for,
+   acceptance of support, warranty, indemnity, or other liability obligations
+   and/or rights consistent with this License. However, in accepting such
+   obligations, You may act only on Your own behalf and on Your sole
+   responsibility, not on behalf of any other Contributor, and only if You
+   agree to indemnify, defend, and hold each Contributor harmless for any
+   liability incurred by, or claims asserted against, such Contributor by
+   reason of your accepting any such warranty or additional liability. END OF
+   TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+To apply the Apache License to your work, attach the following boilerplate
+notice, with the fields enclosed by brackets "[]" replaced with your own
+identifying information. (Don't include the brackets!) The text should be
+enclosed in the appropriate comment syntax for the file format. We also
+recommend that a file or class name and description of purpose be included on
+the same "printed page" as the copyright notice for easier identification
+within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+
+you may not use this file except in compliance with the License.
+
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+
+distributed under the License is distributed on an "AS IS" BASIS,
+
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+See the License for the specific language governing permissions and
+
+limitations under the License.
+
+* For Tensorboard see also this required NOTICE:
+    Copyright 2017 The TensorFlow Authors.  All rights reserved.

From 0187212fc3e8e93c48b5687b2e584d7335c42c08 Mon Sep 17 00:00:00 2001
From: Denis Davydenko <dzianis.davydzenka@gmail.com>
Date: Mon, 2 Dec 2019 13:58:10 -0800
Subject: [PATCH 10/11] Update THIRD-PARTY

(cherry picked from commit e18a8bffdbbdfaeb5f38a16ba2e048b2ec60161c)
---
 THIRD-PARTY | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/THIRD-PARTY b/THIRD-PARTY
index 1a95df9c9..9bb026fce 100644
--- a/THIRD-PARTY
+++ b/THIRD-PARTY
@@ -1,6 +1,8 @@
 ** Tensorboard; version 1.13.0 -- https://github.com/tensorflow/tensorboard
 Copyright 2017 The TensorFlow Authors.  All rights reserved.
 
+This project has been modified for use in Amazon SageMaker Debugger.
+
 Apache License
 
 Version 2.0, January 2004

From b20b2e42730441b598eeadaaf9022d9ba8410aa7 Mon Sep 17 00:00:00 2001
From: Denis Davydenko <dden@amazon.com>
Date: Mon, 2 Dec 2019 15:54:33 -0800
Subject: [PATCH 11/11] Mention modifications of original .proto files from TB.

---
 THIRD-PARTY => LICENSE-THIRD-PARTY               | 0
 setup.py                                         | 1 +
 smdebug/core/tfevent/proto/attr_value.proto      | 1 +
 smdebug/core/tfevent/proto/event.proto           | 1 +
 smdebug/core/tfevent/proto/graph.proto           | 1 +
 smdebug/core/tfevent/proto/node_def.proto        | 1 +
 smdebug/core/tfevent/proto/resource_handle.proto | 1 +
 smdebug/core/tfevent/proto/summary.proto         | 1 +
 smdebug/core/tfevent/proto/tensor.proto          | 1 +
 smdebug/core/tfevent/proto/tensor_shape.proto    | 1 +
 smdebug/core/tfevent/proto/types.proto           | 1 +
 smdebug/core/tfevent/proto/versions.proto        | 1 +
 12 files changed, 11 insertions(+)
 rename THIRD-PARTY => LICENSE-THIRD-PARTY (100%)

diff --git a/THIRD-PARTY b/LICENSE-THIRD-PARTY
similarity index 100%
rename from THIRD-PARTY
rename to LICENSE-THIRD-PARTY
diff --git a/setup.py b/setup.py
index 8b39686ce..85fce806f 100644
--- a/setup.py
+++ b/setup.py
@@ -58,6 +58,7 @@ def build_package(version):
         setup_requires=["pytest-runner"],
         tests_require=TESTS_PACKAGES,
         python_requires=">=3.6",
+        license='Apache License Version 2.0'
     )
 
 
diff --git a/smdebug/core/tfevent/proto/attr_value.proto b/smdebug/core/tfevent/proto/attr_value.proto
index 5fe54e43e..de7041229 100644
--- a/smdebug/core/tfevent/proto/attr_value.proto
+++ b/smdebug/core/tfevent/proto/attr_value.proto
@@ -1,3 +1,4 @@
+/* This file was modified for Amazon SageMaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */
 syntax = "proto3";
 
 package smdebug;
diff --git a/smdebug/core/tfevent/proto/event.proto b/smdebug/core/tfevent/proto/event.proto
index 73a7f6ab5..a7c91943e 100644
--- a/smdebug/core/tfevent/proto/event.proto
+++ b/smdebug/core/tfevent/proto/event.proto
@@ -1,3 +1,4 @@
+/* This file was modified for Amazon SageMaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */
 syntax = "proto3";
 
 package smdebug;
diff --git a/smdebug/core/tfevent/proto/graph.proto b/smdebug/core/tfevent/proto/graph.proto
index 1c5090ae7..dae5c33fa 100644
--- a/smdebug/core/tfevent/proto/graph.proto
+++ b/smdebug/core/tfevent/proto/graph.proto
@@ -1,3 +1,4 @@
+/* This file was modified for Amazon SageMaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */
 syntax = "proto3";
 
 package smdebug;
diff --git a/smdebug/core/tfevent/proto/node_def.proto b/smdebug/core/tfevent/proto/node_def.proto
index b79617153..6fa628291 100644
--- a/smdebug/core/tfevent/proto/node_def.proto
+++ b/smdebug/core/tfevent/proto/node_def.proto
@@ -1,3 +1,4 @@
+/* This file was modified for Amazon SageMaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */
 syntax = "proto3";
 
 package smdebug;
diff --git a/smdebug/core/tfevent/proto/resource_handle.proto b/smdebug/core/tfevent/proto/resource_handle.proto
index 6536161bb..7a20b9e61 100644
--- a/smdebug/core/tfevent/proto/resource_handle.proto
+++ b/smdebug/core/tfevent/proto/resource_handle.proto
@@ -1,3 +1,4 @@
+/* This file was modified for Amazon SageMaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */
 syntax = "proto3";
 
 package smdebug;
diff --git a/smdebug/core/tfevent/proto/summary.proto b/smdebug/core/tfevent/proto/summary.proto
index 5a8c41b94..10206307f 100644
--- a/smdebug/core/tfevent/proto/summary.proto
+++ b/smdebug/core/tfevent/proto/summary.proto
@@ -1,3 +1,4 @@
+/* This file was modified for Amazon SageMaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */
 syntax = "proto3";
 
 package smdebug;
diff --git a/smdebug/core/tfevent/proto/tensor.proto b/smdebug/core/tfevent/proto/tensor.proto
index 193df010a..56b87bbd5 100644
--- a/smdebug/core/tfevent/proto/tensor.proto
+++ b/smdebug/core/tfevent/proto/tensor.proto
@@ -1,3 +1,4 @@
+/* This file was modified for Amazon SageMaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */
 syntax = "proto3";
 
 package smdebug;
diff --git a/smdebug/core/tfevent/proto/tensor_shape.proto b/smdebug/core/tfevent/proto/tensor_shape.proto
index e7c87d666..91c502032 100644
--- a/smdebug/core/tfevent/proto/tensor_shape.proto
+++ b/smdebug/core/tfevent/proto/tensor_shape.proto
@@ -1,3 +1,4 @@
+/* This file was modified for Amazon SageMaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */
 // Protocol buffer representing the shape of tensors.
 
 syntax = "proto3";
diff --git a/smdebug/core/tfevent/proto/types.proto b/smdebug/core/tfevent/proto/types.proto
index 15cea1995..df7e65f05 100644
--- a/smdebug/core/tfevent/proto/types.proto
+++ b/smdebug/core/tfevent/proto/types.proto
@@ -1,3 +1,4 @@
+/* This file was modified for Amazon SageMaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */
 syntax = "proto3";
 
 package smdebug;
diff --git a/smdebug/core/tfevent/proto/versions.proto b/smdebug/core/tfevent/proto/versions.proto
index c27a8d91a..80ed94c45 100644
--- a/smdebug/core/tfevent/proto/versions.proto
+++ b/smdebug/core/tfevent/proto/versions.proto
@@ -1,3 +1,4 @@
+/* This file was modified for Amazon SageMaker Debugger from the original version in https://github.com/tensorflow/tensorboard. */
 syntax = "proto3";
 
 package smdebug;