From ac3c25016896cc934394cf3cff38032d7cb70028 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 12:42:35 -0700 Subject: [PATCH 01/31] WIP saveshape --- smdebug/core/hook.py | 23 +++++++++++++++++++++++ smdebug/core/reduction_config.py | 18 +++++++++++++----- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 10577fed6..e243f6149 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -343,6 +343,12 @@ def _get_collections_to_save_for_step(self) -> Set["Collection"]: ) return self._collections_to_save_for_step + def _saving_shapes_in_step(self) -> bool: + for coll in self._get_collections_to_save_for_step(): + if coll.reduction_config.save_shape is True: + return True + return False + def _get_collections_with_tensor(self, tensor_name) -> Set["Collection"]: self._assert_prep() # for tf this will be prepopulated in check_and_add_tensor @@ -456,6 +462,9 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: return self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker) + if self._saving_shapes_in_step(): + self.shape_writer = + def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: """ :param tensor_name: @@ -725,6 +734,16 @@ def _write_raw_tensor(self, tensor_name, tensor_value, save_collections, tensor_ if reduction_config.save_raw_tensor is True: self._write_raw_tensor_simple(tensor_name, tensor_value, tensor_ref=tensor_ref) break + + def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=None): + for s_col in save_collections: + reduction_config = s_col.reduction_config + if reduction_config.save_shape is True: + numpy_tensor_value = self._make_numpy_array(tensor_value) + this_size, this_shape = size_and_shape(numpy_tensor_value) + + self._write_raw_tensor_simple(tensor_name, tensor_value, tensor_ref=tensor_ref) + break def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None): # tensor_ref is used by TF @@ -742,6 +761,7 @@ def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, t timestamp=timestamp, ) + def _save_for_tensor(self, tensor_name, tensor_value, check_before_write=True): """ Identifies if this tensor should be saved for this step @@ -805,6 +825,9 @@ def _write_for_tensor(self, tensor_name, tensor_value, save_collections, tensor_ :param save_collections: list of collections which are being saved for this step """ self._log_save(tensor_name, save_collections) + + self._write_shape(tensor_name, tensor_value, save_collections, tensor_ref=tensor_ref) + # write reductions defined for collections this tensor may be part of self._write_reductions(tensor_name, tensor_value, save_collections, tensor_ref=tensor_ref) diff --git a/smdebug/core/reduction_config.py b/smdebug/core/reduction_config.py index 9a24ff6d6..6ad78cdee 100644 --- a/smdebug/core/reduction_config.py +++ b/smdebug/core/reduction_config.py @@ -1,6 +1,8 @@ # Standard Library import json from typing import Any, Dict +from smdebug.core.logger import get_logger +logger = get_logger() # First Party from smdebug.core.utils import split @@ -8,7 +10,7 @@ ALLOWED_REDUCTIONS = ["min", "max", "mean", "std", "variance", "sum", "prod"] ALLOWED_NORMS = ["l1", "l2"] REDUCTION_CONFIG_VERSION_NUM = "v0" -ALLOWED_PARAMS = ["reductions", "abs_reductions", "norms", "abs_norms", "save_raw_tensor"] +ALLOWED_PARAMS = ["reductions", "abs_reductions", "norms", "abs_norms", "save_raw_tensor", "save_shape"] 
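For orientation, the new save_shape flag is meant to be switched on through the normal ReductionConfig path. A minimal usage sketch, mirroring the hook construction in the PyTorch test added later in this series (the out_dir value here is illustrative):

    # Enable shape saving instead of full tensor values; this mirrors
    # the hook construction used in tests/pytorch/test_reduce_config.py
    # later in this series.
    from smdebug.pytorch import ReductionConfig, SaveConfig
    from smdebug.pytorch.hook import Hook

    hook = Hook(
        out_dir="/tmp/shape_run",                # illustrative trial directory
        save_config=SaveConfig(save_steps=[0]),  # save only at step 0
        save_all=True,                           # register every tensor
        reduction_config=ReductionConfig(save_shape=True),
    )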
class ReductionConfig: @@ -49,12 +51,14 @@ def __init__( norms=None, abs_norms=None, save_raw_tensor=False, + save_shape=False, ): self.reductions = reductions if reductions is not None else [] self.abs_reductions = abs_reductions if abs_reductions is not None else [] self.norms = norms if norms is not None else [] self.abs_norms = abs_norms if abs_norms is not None else [] self.save_raw_tensor = save_raw_tensor + self.save_shape = save_shape ## DO NOT REMOVE, if you add anything here, please make sure that _check & from_json is updated accordingly self._check() @@ -75,6 +79,9 @@ def _check(self): raise ValueError("abs_norms can only be one of " + ",".join(ALLOWED_NORMS)) if not isinstance(self.save_raw_tensor, bool): raise ValueError(f"save_raw_tensor={self.save_raw_tensor} must be a boolean") + if not isinstance(self.save_shape, bool): + raise ValueError(f"save_shape={self.save_shape} must be a boolean") + @classmethod def from_dict(cls, params: Dict[str, Any]) -> "ReductionConfig": @@ -83,7 +90,7 @@ def from_dict(cls, params: Dict[str, Any]) -> "ReductionConfig": return None if not isinstance(params, dict): raise ValueError(f"params={params} must be dict") - + save_shape = params.get("save_shape", False) save_raw_tensor = params.get("save_raw_tensor", False) # Parse comma-separated string into array all_reductions = split(params.get("reductions", "")) @@ -108,6 +115,7 @@ def from_dict(cls, params: Dict[str, Any]) -> "ReductionConfig": norms=norms, abs_norms=abs_norms, save_raw_tensor=save_raw_tensor, + save_shape=save_shape ) @classmethod @@ -116,7 +124,6 @@ def from_json(cls, json_str: str) -> "ReductionConfig": return cls.from_dict(d) def to_json_dict(self) -> Dict[str, Any]: - save_raw_tensor = self.save_raw_tensor # Convert reductions from various arrays into single comma-separated string all_reductions = [] for red in self.reductions: @@ -129,7 +136,7 @@ def to_json_dict(self) -> Dict[str, Any]: all_reductions.append(f"abs_{red}_norm") all_reductions_str = ",".join(all_reductions) # Return the dict - return {"save_raw_tensor": save_raw_tensor, "reductions": all_reductions_str} + return {"save_raw_tensor": self.save_raw_tensor, "reductions": all_reductions_str, "save_shape": self.save_shape} def to_json(self) -> str: return json.dumps(self.to_json_dict()) @@ -144,10 +151,11 @@ def __eq__(self, other): and self.norms == other.norms and self.abs_norms == other.abs_norms and self.save_raw_tensor == other.save_raw_tensor + and self.save_shape == other.save_shape ) def __repr__(self): return ( f"" + f"abs_reductions={self.abs_reductions}, norms={self.norms}, abs_norms={self.abs_norms}>, save_shape={self.save_shape}, save_raw_tensor={self.save_raw_tensor}" ) From 34084cc2d534e310733add30a5b6d44dc0e997fc Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 13:42:41 -0700 Subject: [PATCH 02/31] Add shape writer --- smdebug/core/hook.py | 16 +++--- smdebug/core/locations.py | 22 ++++++++ smdebug/core/writer.py | 95 ++++++++++++++++++++++++--------- smdebug/tensorflow/base_hook.py | 8 +++ 4 files changed, 109 insertions(+), 32 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index e243f6149..aaa640cc9 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -424,7 +424,6 @@ def _close_writers(self) -> None: self.writer = None to_delete_writers = [] - # Delete all the tb writers for mode, writer in self.tb_writers.items(): if writer is not None: @@ -434,6 +433,9 @@ def _close_writers(self) -> None: for mode in to_delete_writers: del 
self.tb_writers[mode] + self.shape_writer.close() + self.shape_writer = None + def _initialize_writers(self, only_initialize_if_missing=False) -> None: # Function is overridden in smdebug/tensorflow/base_hook.py if only_initialize_if_missing and self.writer: @@ -463,7 +465,9 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker) if self._saving_shapes_in_step(): - self.shape_writer = + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, step=self.step, worker=self.worker + ) def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: """ @@ -734,15 +738,14 @@ def _write_raw_tensor(self, tensor_name, tensor_value, save_collections, tensor_ if reduction_config.save_raw_tensor is True: self._write_raw_tensor_simple(tensor_name, tensor_value, tensor_ref=tensor_ref) break - + def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=None): for s_col in save_collections: reduction_config = s_col.reduction_config - if reduction_config.save_shape is True: + if self.dry_run is False and reduction_config.save_shape is True: numpy_tensor_value = self._make_numpy_array(tensor_value) this_size, this_shape = size_and_shape(numpy_tensor_value) - - self._write_raw_tensor_simple(tensor_name, tensor_value, tensor_ref=tensor_ref) + self.shape_writer.write_shape(tensor_name, this_shape) break def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None): @@ -761,7 +764,6 @@ def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, t timestamp=timestamp, ) - def _save_for_tensor(self, tensor_name, tensor_value, check_before_write=True): """ Identifies if this tensor should be saved for this step diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index af703e514..746a7a6cd 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py @@ -89,6 +89,28 @@ def get_step_dir_path(cls, trial_dir, step_num): return os.path.join(cls.get_dir(trial_dir), format(step_num, STEP_NUMBER_FORMATTING_LENGTH)) +class ShapeFileLocation(TensorFileLocation): + def __init__(self, step_num, worker_name): + super().__init__(step_num, worker_name) + + def get_filename(self): + step_num_str = self.get_step_num_str() + return f"{step_num_str}_{self.worker_name}_shapes.json" + + @classmethod + def load_filename(cls, s, print_error=True): + name = os.path.basename(s) + m = re.search("(.*)_(.*)_shapes.json$", name) + if m: + step_num = int(m.group(1)) + worker_name = m.group(2) + return cls(step_num=step_num, worker_name=worker_name) + else: + if print_error: + logger.error("Failed to load shape file location: ", s) + return None + + class TensorboardFileLocation(EventFileLocation): def __init__(self, step_num, worker_name, mode=None): super().__init__(step_num, worker_name) diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py index a342cf433..5bda07c0c 100644 --- a/smdebug/core/writer.py +++ b/smdebug/core/writer.py @@ -16,6 +16,9 @@ # under the License. 
"""APIs for logging data in the event file.""" +# Standard Library +from typing import Tuple + # First Party from smdebug.core.modes import MODE_PLUGIN_NAME, MODE_STEP_PLUGIN_NAME from smdebug.core.tfevent.event_file_writer import EventFileWriter @@ -38,7 +41,36 @@ logger = get_logger() -class FileWriter: +class BaseWriter: + def __init__(self, trial_dir, worker, step=0, mode=ModeKeys.GLOBAL): + self.trial_dir = trial_dir + self.step = step + self.worker = worker + if worker is None: + assert False, "Worker should not be none. Check worker name initialization" + self.mode = mode + self._writer = None + + def name(self): + return self._writer.name() + + def __enter__(self): + """Make usable with "with" statement.""" + return self + + def __exit__(self, unused_type, unused_value, unused_traceback): + """Make usable with "with" statement.""" + self.close() + + def flush(self): + """Flushes the event file to disk. + Call this method to make sure that all pending events have been written to disk. + """ + self._writer.flush() + # don't flush index writer as we only want to flush on close + + +class FileWriter(BaseWriter): def __init__( self, trial_dir, @@ -71,12 +103,7 @@ def __init__( verbose : bool Determines whether to print logging messages. """ - self.trial_dir = trial_dir - self.step = step - self.worker = worker - if worker is None: - assert False, "Worker should not be none. Check worker name initialization" - self.mode = mode + super(FileWriter, self).__init__(trial_dir, worker, step, mode) if wtype == "events": el = TensorFileLocation(step_num=self.step, worker_name=self.worker) event_file_path = el.get_file_location(trial_dir=self.trial_dir) @@ -103,14 +130,6 @@ def __init__( ) self._default_bins = _get_default_bins() - def __enter__(self): - """Make usable with "with" statement.""" - return self - - def __exit__(self, unused_type, unused_value, unused_traceback): - """Make usable with "with" statement.""" - self.close() - @staticmethod def _get_metadata(mode, mode_step): sm2 = SummaryMetadata.PluginData(plugin_name=MODE_STEP_PLUGIN_NAME, content=str(mode_step)) @@ -187,13 +206,6 @@ def write_scalar_summary(self, name, value, global_step, timestamp: float = None s = scalar_summary(name, value) self._writer.write_summary(s, global_step, timestamp=timestamp) - def flush(self): - """Flushes the event file to disk. - Call this method to make sure that all pending events have been written to disk. - """ - self._writer.flush() - # don't flush index writer as we only want to flush on close - def close(self): """Flushes the event file to disk and close the file. Call this method when you do not need the summary writer anymore. 
@@ -202,9 +214,6 @@ def close(self): if self.index_writer is not None: self.index_writer.close() - def name(self): - return self._writer.name() - @staticmethod def _check_mode_step(mode, mode_step, global_step): if mode_step is None: @@ -216,3 +225,39 @@ def _check_mode_step(mode, mode_step, global_step): ex_str = "mode can be one of " + ", ".join(mode_keys) raise ValueError(ex_str) return mode, mode_step + + +class ShapeWriter(BaseWriter): + def __init__(self, trial_dir, worker, step=0, mode=ModeKeys.GLOBAL): + super(ShapeWriter, self).__init__(trial_dir, worker, step, mode) + el = ShapeFileLocation(step_num=self.step, worker_name=self.worker, mode=self.mode) + self.file_path = el.get_file_location(base_dir=self.trial_dir) + s3, bucket_name, key_name = is_s3(self.file_path) + if s3: + self._writer = TSAccessS3(bucket_name, key_name, binary=False) + else: + self._writer = TSAccessFile(self.file_path, "a+") + + self.shapes = [] + self.meta = {} + + def write_shape(self, name, shape: Tuple[int]): + self.shapes.append({"name": name, "shape": shape}) + + def flush(self): + if not self._writer: + raise ValueError(f"Cannot flush because self._writer={self._writer}") + if not self.shapes: + raise ValueError(f"Cannot write shapes to file {self.file_path} as it is empty") + + s = json.dumps({"meta": self.meta, "payload": self.shapes}) + self._writer.write(s) + self._writer.flush() + + def close(self): + """Flushes the event file to disk and close the file. + """ + if self._writer is not None: + self.flush() + self._writer.close() + self._writer = None diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index a8cc9f679..4c935d85c 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -341,6 +341,11 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: else: raise NotImplementedError + if self._saving_shapes_in_step(): + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, step=self.step, worker=self.worker + ) + def _close_writers(self) -> None: if self.dry_run: return @@ -373,6 +378,9 @@ def _close_writers(self) -> None: for mode in to_delete_writers: del self.tb_writers[mode] + self.shape_writer.close() + self.shape_writer = None + def _export_model(self): tb_writer = self._maybe_get_tb_writer() if tb_writer: From e8a6d988bc24a8e83543d6d7b752c7dc1a90bda0 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 17:43:26 -0700 Subject: [PATCH 03/31] Add pytorch test --- smdebug/core/hook.py | 15 ++++-- smdebug/core/writer.py | 16 ++++-- tests/pytorch/test_reduce_config.py | 82 +++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 8 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index aaa640cc9..1cd8294d8 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -46,7 +46,7 @@ size_and_shape, validate_custom_tensor_value, ) -from smdebug.core.writer import FileWriter +from smdebug.core.writer import FileWriter, ShapeWriter from smdebug.exceptions import InvalidCollectionConfiguration try: @@ -222,7 +222,7 @@ def __init__( self.mode = ModeKeys.GLOBAL self.mode_steps = {ModeKeys.GLOBAL: init_step} self.writer = None - + self.shape_writer = None if is_sagemaker_job() and SageMakerFileMetricsWriter is not None: self.metrics_writer = SageMakerFileMetricsWriter() else: @@ -433,8 +433,9 @@ def _close_writers(self) -> None: for mode in to_delete_writers: del self.tb_writers[mode] - self.shape_writer.close() - self.shape_writer = None + if self.shape_writer is not None: + 
self.shape_writer.close() + self.shape_writer = None def _initialize_writers(self, only_initialize_if_missing=False) -> None: # Function is overridden in smdebug/tensorflow/base_hook.py @@ -745,7 +746,11 @@ def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=N if self.dry_run is False and reduction_config.save_shape is True: numpy_tensor_value = self._make_numpy_array(tensor_value) this_size, this_shape = size_and_shape(numpy_tensor_value) - self.shape_writer.write_shape(tensor_name, this_shape) + if tensor_ref is not None: + name = tensor_ref.tf_obj.name + else: + name = tensor_name + self.shape_writer.write_shape(name, this_shape) break def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None): diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py index 5bda07c0c..abb606632 100644 --- a/smdebug/core/writer.py +++ b/smdebug/core/writer.py @@ -17,9 +17,11 @@ """APIs for logging data in the event file.""" # Standard Library +import json from typing import Tuple # First Party +from smdebug.core.access_layer import TSAccessFile, TSAccessS3 from smdebug.core.modes import MODE_PLUGIN_NAME, MODE_STEP_PLUGIN_NAME from smdebug.core.tfevent.event_file_writer import EventFileWriter from smdebug.core.tfevent.index_file_writer import IndexWriter @@ -32,9 +34,15 @@ scalar_summary, ) from smdebug.core.tfevent.util import make_tensor_proto +from smdebug.core.utils import is_s3 # Local -from .locations import IndexFileLocationUtils, TensorboardFileLocation, TensorFileLocation +from .locations import ( + IndexFileLocationUtils, + ShapeFileLocation, + TensorboardFileLocation, + TensorFileLocation, +) from .logger import get_logger from .modes import ModeKeys @@ -230,8 +238,8 @@ def _check_mode_step(mode, mode_step, global_step): class ShapeWriter(BaseWriter): def __init__(self, trial_dir, worker, step=0, mode=ModeKeys.GLOBAL): super(ShapeWriter, self).__init__(trial_dir, worker, step, mode) - el = ShapeFileLocation(step_num=self.step, worker_name=self.worker, mode=self.mode) - self.file_path = el.get_file_location(base_dir=self.trial_dir) + el = ShapeFileLocation(step_num=self.step, worker_name=self.worker) + self.file_path = el.get_file_location(trial_dir=self.trial_dir) s3, bucket_name, key_name = is_s3(self.file_path) if s3: self._writer = TSAccessS3(bucket_name, key_name, binary=False) @@ -253,6 +261,8 @@ def flush(self): s = json.dumps({"meta": self.meta, "payload": self.shapes}) self._writer.write(s) self._writer.flush() + self.meta = {} + self.shapes = [] def close(self): """Flushes the event file to disk and close the file. 
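At this stage of the series each shape file is a standalone JSON document of the form {"meta": {...}, "payload": [{"name": ..., "shape": ...}, ...]}, produced by ShapeWriter.flush() above. A sketch of reading one back, which is essentially what the PyTorch test below does (the out_dir is illustrative; note that patch 11 later folds shapes into the index files instead):

    # Read back a shape file written by ShapeWriter at this point in the series.
    import json
    import os

    from smdebug.core.config_constants import DEFAULT_WORKER_NAME
    from smdebug.core.locations import ShapeFileLocation

    sl = ShapeFileLocation(0, DEFAULT_WORKER_NAME)  # step 0, single-process worker
    path = os.path.join("/tmp/shape_run", sl.get_file_location())
    with open(path) as f:
        shape_dict = json.load(f)  # {"meta": {...}, "payload": [...]}
    for record in shape_dict["payload"]:
        # each record is {"name": <tensor name>, "shape": [<int dims>]}
        print(record["name"], tuple(record["shape"]))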
diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index 230e0410c..c7b525895 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -1,13 +1,18 @@ # Standard Library +import json import os import shutil from datetime import datetime # Third Party import torch +import torch.nn as nn +import torch.nn.functional as F import torch.optim as optim # First Party +from smdebug.core.config_constants import DEFAULT_WORKER_NAME +from smdebug.core.locations import ShapeFileLocation from smdebug.pytorch import ReductionConfig, SaveConfig from smdebug.pytorch.hook import Hook as t_hook from smdebug.trials import create_trial @@ -86,6 +91,83 @@ def test_reduce_config(hook=None, out_dir=None): shutil.rmtree(out_dir) +def test_save_shapes(hook=None, out_dir=None): + class ChildA(nn.Module): + def __init__(self): + super(ChildA, self).__init__() + self.child2 = ChildB() + self.relu0 = nn.ReLU() + + def forward(self, x): + return self.relu0(self.child2(x)) + + class ChildB(nn.Module): + def __init__(self): + super(ChildB, self).__init__() + self.conv1 = nn.Conv2d(1, 20, 5, 1) + + def forward(self, x): + return self.conv1(x) + + class NestedNet(nn.Module): + def __init__(self): + super(NestedNet, self).__init__() + self.child1 = ChildA() + self.max_pool = nn.MaxPool2d(2, stride=2) + self.conv2 = nn.Conv2d(20, 50, 5, 1) + relu_module = nn.ReLU() + self.relu1 = nn.ReLU() + self.max_pool2 = nn.MaxPool2d(2, stride=2) + self.fc1 = nn.Linear(4 * 4 * 50, 500) + self.relu2 = nn.ReLU() + self.fc2 = nn.Linear(500, 10) + + def forward(self, x): + x = self.child1(x) + x = self.max_pool(x) + x = self.relu1(self.conv2(x)) + x = self.max_pool2(x) + x = x.view(-1, 4 * 4 * 50) + x = self.relu2(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + hook_created = False + if hook is None: + global_reduce_config = ReductionConfig(save_shape=True) + global_save_config = SaveConfig(save_steps=[0]) + + run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f") + out_dir = "/tmp/" + run_id + hook = t_hook( + out_dir=out_dir, + save_config=global_save_config, + save_all=True, + reduction_config=global_reduce_config, + ) + hook_created = True + + model = NestedNet().to(torch.device("cpu")) + hook.register_module(model) + optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) + train(model, hook, torch.device("cpu"), optimizer, num_steps=10) + + sl = ShapeFileLocation(0, DEFAULT_WORKER_NAME) + path = os.path.join(out_dir, sl.get_file_location()) + with open(path) as jsfile: + shape_dict = json.load(jsfile) + print(shape_dict["payload"]) + assert "payload" in shape_dict + assert len(shape_dict["payload"]) == 41 + for ts in shape_dict["payload"]: + for dim in ts["shape"]: + assert isinstance(dim, int) + assert isinstance(ts["name"], str) + + if hook_created: + shutil.rmtree(out_dir) + + # Test creating hook by loading the json file with reduction configs. 
def test_reduce_config_with_json(): from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR From 907cf640cff2ba895f8f2bf9cbab23aa42e33a6c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 18:30:15 -0700 Subject: [PATCH 04/31] Add untested keras test --- tests/tensorflow2/test_keras.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 4323b5284..ce2965514 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -183,6 +183,29 @@ def helper_keras_gradtape( hook.close() +def test_keras_gradtape_shapes(out_dir): + hook = smd.KerasHook( + out_dir=out_dir, + save_all=saveall, + save_config=SaveConfig(save_steps=[0], reduce_config=ReductionConfig(save_shape=True)), + ) + helper_keras_gradtape(trial_dir=out_dir, hook=hook) + sl = ShapeFileLocation(0, DEFAULT_WORKER_NAME) + path = os.path.join(out_dir, sl.get_file_location()) + with open(path) as jsfile: + shape_dict = json.load(jsfile) + print(shape_dict["payload"]) + assert "payload" in shape_dict + assert len(shape_dict["payload"]) == 41 + for ts in shape_dict["payload"]: + for dim in ts["shape"]: + assert isinstance(dim, int) + assert isinstance(ts["name"], str) + + if hook_created: + shutil.rmtree(out_dir) + + @pytest.mark.skip_if_non_eager @pytest.mark.slow @pytest.mark.parametrize("saveall", [True, False]) From 86842e6f57be54a9ea686a9edf0a6f9f27d0e552 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 18:43:15 -0700 Subject: [PATCH 05/31] fix syntax --- smdebug/tensorflow/base_hook.py | 2 +- tests/tensorflow2/test_keras.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 4c935d85c..d2512623a 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -16,7 +16,7 @@ from smdebug.core.reductions import get_numpy_reduction, get_reduction_tensor_name from smdebug.core.tfevent.util import make_numpy_array from smdebug.core.utils import serialize_tf_device -from smdebug.core.writer import FileWriter +from smdebug.core.writer import FileWriter, ShapeWriter # Local from .collection import CollectionKeys, CollectionManager diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index ce2965514..af355dafe 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -186,8 +186,9 @@ def helper_keras_gradtape( def test_keras_gradtape_shapes(out_dir): hook = smd.KerasHook( out_dir=out_dir, - save_all=saveall, - save_config=SaveConfig(save_steps=[0], reduce_config=ReductionConfig(save_shape=True)), + save_all=True, + save_config=SaveConfig(save_steps=[0]), + reduction_config=ReductionConfig(save_shape=True), ) helper_keras_gradtape(trial_dir=out_dir, hook=hook) sl = ShapeFileLocation(0, DEFAULT_WORKER_NAME) From 651c4408394a4adff6bc75589a8f0f5e42fc1718 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 18:45:02 -0700 Subject: [PATCH 06/31] fix syntax --- smdebug/tensorflow/base_hook.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index d2512623a..42b9b07d0 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -378,8 +378,9 @@ def _close_writers(self) -> None: for mode in to_delete_writers: del self.tb_writers[mode] - self.shape_writer.close() - self.shape_writer = None + if 
self.shape_writer is not None: + self.shape_writer.close() + self.shape_writer = None def _export_model(self): tb_writer = self._maybe_get_tb_writer() From fc25940a8b1e2fdb898c55cf5ba248dc38acf075 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 18:46:22 -0700 Subject: [PATCH 07/31] Import --- tests/tensorflow2/test_keras.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index af355dafe..4c41433af 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -23,6 +23,7 @@ from smdebug.core.access_layer import has_training_ended from smdebug.core.collection import CollectionKeys from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR +from smdebug.core.locations import ShapeFileLocation from smdebug.core.modes import ModeKeys from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS from smdebug.exceptions import TensorUnavailableForStep From 1357f5d025018829262bc9952f0f0a3179700a3c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 10 Aug 2020 18:47:25 -0700 Subject: [PATCH 08/31] Import --- tests/tensorflow2/test_keras.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 4c41433af..f69a5a241 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -22,6 +22,7 @@ import smdebug.tensorflow as smd from smdebug.core.access_layer import has_training_ended from smdebug.core.collection import CollectionKeys +from smdebug.core.config_constants import DEFAULT_WORKER_NAME from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR from smdebug.core.locations import ShapeFileLocation from smdebug.core.modes import ModeKeys From 44358ee6f7aeebc82e9aa3707884dd3208da0891 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 12 Aug 2020 11:52:28 -0700 Subject: [PATCH 09/31] Add tests for TF --- smdebug/core/writer.py | 11 ++++--- smdebug/tensorflow/base_hook.py | 9 ++++-- tests/pytorch/test_reduce_config.py | 18 ++--------- .../tensorflow/hooks/test_estimator_modes.py | 27 +++++++++++++++- tests/tensorflow/hooks/test_reductions.py | 16 ++++++++++ tests/tensorflow2/test_keras.py | 31 +++++++++---------- tests/utils.py | 20 ++++++++++++ 7 files changed, 91 insertions(+), 41 deletions(-) diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py index abb606632..5d5cb93d5 100644 --- a/smdebug/core/writer.py +++ b/smdebug/core/writer.py @@ -256,7 +256,9 @@ def flush(self): if not self._writer: raise ValueError(f"Cannot flush because self._writer={self._writer}") if not self.shapes: - raise ValueError(f"Cannot write shapes to file {self.file_path} as it is empty") + raise ValueError( + f"Cannot write shapes to file {self.file_path} as it is empty. {self.shapes}" + ) s = json.dumps({"meta": self.meta, "payload": self.shapes}) self._writer.write(s) @@ -268,6 +270,7 @@ def close(self): """Flushes the event file to disk and close the file. 
""" if self._writer is not None: - self.flush() - self._writer.close() - self._writer = None + if self.shapes: + self.flush() + self._writer.close() + self._writer = None diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 42b9b07d0..0cf30bf48 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -342,9 +342,12 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: raise NotImplementedError if self._saving_shapes_in_step(): - self.shape_writer = ShapeWriter( - trial_dir=self.out_dir, step=self.step, worker=self.worker - ) + if self.shape_writer is None or only_initialize_if_missing is False: + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, step=self.step, worker=self.worker + ) + else: + assert self.shape_writer is None def _close_writers(self) -> None: if self.dry_run: diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index c7b525895..7390f335f 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -1,5 +1,4 @@ # Standard Library -import json import os import shutil from datetime import datetime @@ -9,10 +8,9 @@ import torch.nn as nn import torch.nn.functional as F import torch.optim as optim +from tests.utils import verify_shapes # First Party -from smdebug.core.config_constants import DEFAULT_WORKER_NAME -from smdebug.core.locations import ShapeFileLocation from smdebug.pytorch import ReductionConfig, SaveConfig from smdebug.pytorch.hook import Hook as t_hook from smdebug.trials import create_trial @@ -151,19 +149,7 @@ def forward(self, x): hook.register_module(model) optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) - - sl = ShapeFileLocation(0, DEFAULT_WORKER_NAME) - path = os.path.join(out_dir, sl.get_file_location()) - with open(path) as jsfile: - shape_dict = json.load(jsfile) - print(shape_dict["payload"]) - assert "payload" in shape_dict - assert len(shape_dict["payload"]) == 41 - for ts in shape_dict["payload"]: - for dim in ts["shape"]: - assert isinstance(dim, int) - assert isinstance(ts["name"], str) - + verify_shapes(out_dir, 0, 4) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py index e7bd40945..fe481ace4 100644 --- a/tests/tensorflow/hooks/test_estimator_modes.py +++ b/tests/tensorflow/hooks/test_estimator_modes.py @@ -18,6 +18,7 @@ import pytest import tensorflow as tf from tests.analysis.utils import delete_s3_prefix +from tests.utils import verify_shapes # First Party import smdebug.tensorflow as smd @@ -30,10 +31,12 @@ def help_test_mnist( path, save_config=None, + reduction_config=None, hook=None, set_modes=True, num_steps=10, num_eval_steps=None, + save_all=False, steps=None, include_collections=None, ): @@ -125,7 +128,11 @@ def cnn_model_fn(features, labels, mode): if include_collections is None: include_collections = ["weights", "gradients", "default", "losses"] hook = smd.SessionHook( - out_dir=trial_dir, save_config=save_config, include_collections=include_collections + out_dir=trial_dir, + save_config=save_config, + include_collections=include_collections, + save_all=save_all, + reduction_config=reduction_config, ) if num_eval_steps is None: @@ -187,6 +194,24 @@ def test_mnist(out_dir, on_s3=False): helper_test_mnist_trial(out_dir) +@pytest.mark.slow # 0:02 to run +def test_mnist_shapes(out_dir, on_s3=False): + if 
on_s3: + run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f") + bucket = "smdebug-testing" + prefix = "outputs/hooks/estimator_modes/" + run_id + out_dir = f"s3://{bucket}/{prefix}" + help_test_mnist( + out_dir, + save_all=True, + save_config=smd.SaveConfig(save_steps=[0]), + num_steps=1, + steps=None, + reduction_config=smd.ReductionConfig(save_shape=True), + ) + verify_shapes(out_dir, 0, 249) + + @pytest.mark.slow # 0:02 to run def test_mnist_local_json(out_dir, monkeypatch): monkeypatch.setenv( diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index 4fde66a49..b27589ff6 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -1,5 +1,8 @@ # Standard Library +# Third Party +from tests.tensorflow2.utils import verify_shapes + # First Party import smdebug.tensorflow as smd from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR @@ -57,6 +60,19 @@ def test_reductions(out_dir, save_raw_tensor=False): helper_test_reductions(out_dir, hook, save_raw_tensor) +def test_shapes(out_dir, save_raw_tensor=False): + pre_test_clean_up() + rdnc = smd.ReductionConfig(save_shape=True, save_raw_tensor=save_raw_tensor) + hook = smd.SessionHook( + out_dir=out_dir, + save_config=smd.SaveConfig(save_interval=1), + reduction_config=rdnc, + include_collections=["weights", "gradients", "losses"], + ) + simple_model(hook) + verify_shapes(out_dir, 0, 2) + + def test_reductions_with_raw_tensor(out_dir): test_reductions(out_dir, save_raw_tensor=True) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index f69a5a241..6e9d932ec 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -16,15 +16,13 @@ from tests.constants import TEST_DATASET_S3_PATH from tests.tensorflow2.utils import is_tf_2_2, is_tf_2_3 from tests.tensorflow.utils import create_trial_fast_refresh -from tests.utils import use_s3_datasets +from tests.utils import use_s3_datasets, verify_shapes # First Party import smdebug.tensorflow as smd from smdebug.core.access_layer import has_training_ended from smdebug.core.collection import CollectionKeys -from smdebug.core.config_constants import DEFAULT_WORKER_NAME from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR -from smdebug.core.locations import ShapeFileLocation from smdebug.core.modes import ModeKeys from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS from smdebug.exceptions import TensorUnavailableForStep @@ -193,20 +191,8 @@ def test_keras_gradtape_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_gradtape(trial_dir=out_dir, hook=hook) - sl = ShapeFileLocation(0, DEFAULT_WORKER_NAME) - path = os.path.join(out_dir, sl.get_file_location()) - with open(path) as jsfile: - shape_dict = json.load(jsfile) - print(shape_dict["payload"]) - assert "payload" in shape_dict - assert len(shape_dict["payload"]) == 41 - for ts in shape_dict["payload"]: - for dim in ts["shape"]: - assert isinstance(dim, int) - assert isinstance(ts["name"], str) - - if hook_created: - shutil.rmtree(out_dir) + verify_shapes(out_dir, 0, 9) + verify_shapes(out_dir, 500, 14) @pytest.mark.skip_if_non_eager @@ -479,6 +465,17 @@ def test_keras_fit(out_dir, tf_eager_mode, saveall): assert trial.tensor(tname).value(0) is not None +def test_keras_fit_shapes(out_dir): + hook = smd.KerasHook( + out_dir=out_dir, + save_all=True, + save_config=SaveConfig(save_steps=[0]), + 
reduction_config=ReductionConfig(save_shape=True), + ) + helper_keras_fit(trial_dir=out_dir, hook=hook) + verify_shapes(out_dir, 0, 9) + + @pytest.mark.slow def test_base_reductions(out_dir, tf_eager_mode): helper_keras_fit( diff --git a/tests/utils.py b/tests/utils.py index d5db2a8ba..4472cf7bc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,4 +1,5 @@ # Standard Library +import json import os import shutil from pathlib import Path @@ -12,8 +13,10 @@ CONFIG_FILE_PATH_ENV_STR, DEFAULT_SAGEMAKER_OUTDIR, DEFAULT_SAGEMAKER_TENSORBOARD_PATH, + DEFAULT_WORKER_NAME, TENSORBOARD_CONFIG_FILE_PATH_ENV_STR, ) +from smdebug.core.locations import ShapeFileLocation from smdebug.core.utils import is_s3, remove_file_if_exists @@ -27,6 +30,23 @@ def use_s3_datasets(): return False +def verify_shapes(out_dir, step_num, num_tensors): + sl = ShapeFileLocation(step_num, DEFAULT_WORKER_NAME) + path = os.path.join(out_dir, sl.get_file_location()) + with open(path) as jsfile: + shape_dict = json.load(jsfile) + print(shape_dict["payload"]) + assert "payload" in shape_dict + assert len(shape_dict["payload"]) == num_tensors, ( + len(shape_dict["payload"]), + shape_dict["payload"], + ) + for ts in shape_dict["payload"]: + for dim in ts["shape"]: + assert isinstance(dim, int) + assert isinstance(ts["name"], str) + + class SagemakerSimulator(object): """ Creates an environment variable pointing to a JSON config file, and creates the config file. From f146c77bcdc0f0020caf3b56f8c197a699ed4098 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 12 Aug 2020 17:17:29 -0700 Subject: [PATCH 10/31] Simplify read code --- smdebug/trials/local_trial.py | 2 +- smdebug/trials/s3_trial.py | 2 +- smdebug/trials/trial.py | 18 +++--------------- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/smdebug/trials/local_trial.py b/smdebug/trials/local_trial.py index 0fa4e76ba..e8cfbe51e 100644 --- a/smdebug/trials/local_trial.py +++ b/smdebug/trials/local_trial.py @@ -34,7 +34,7 @@ def __init__( self.index_reader = LocalIndexReader(self.path) self.logger.info(f"Loading trial {name} at path {self.trial_dir}") self._load_collections() - self._load_tensors() + self.refresh_data() def _get_collection_files(self) -> list: return list_collection_files_in_directory(get_path_to_collections(self.path)) diff --git a/smdebug/trials/s3_trial.py b/smdebug/trials/s3_trial.py index 0a3cb6389..c2751fb40 100644 --- a/smdebug/trials/s3_trial.py +++ b/smdebug/trials/s3_trial.py @@ -45,7 +45,7 @@ def __init__( self.path = "s3://" + os.path.join(self.bucket_name, self.prefix_name) self.index_reader = S3IndexReader(self.path) self._load_collections() - self._load_tensors() + self.refresh_data() def _get_collection_files(self) -> list: collection_files, _ = list_s3_objects( diff --git a/smdebug/trials/trial.py b/smdebug/trials/trial.py index 5008ef6a7..2d527305e 100644 --- a/smdebug/trials/trial.py +++ b/smdebug/trials/trial.py @@ -190,7 +190,7 @@ def maybe_refresh(self, name=None): retry_count = 2 while retry_count > 0: if name is None: - self.refresh_tensors() + self.refresh_data() else: self.refresh_tensor(name) if retry_count > 1: @@ -215,7 +215,7 @@ def maybe_refresh(self, name=None): def refresh_tensor(self, tname, steps=None): # for now we load all tensors at once - self.refresh_tensors() + self.refresh_data() def tensor(self, tname): # will not show tensor if it was not written yet @@ -560,10 +560,6 @@ def has_passed_step(self, step, mode=ModeKeys.GLOBAL) -> StepState: return StepState.UNAVAILABLE return 
StepState.NOT_YET_AVAILABLE - def _load_tensors(self): - if self.index_mode: - self._load_tensors_from_index_files() - def _update_last_index_token(self, new_index_token: str) -> None: """ This function updates the last_index_token in the following scenarios: @@ -625,15 +621,7 @@ def _update_last_index_token(self, new_index_token: str) -> None: f"Updating last_complete_step to: {self.last_complete_step}. " ) - def _load_tensors_from_index_files(self): - self.index_tensors_dict, new_index_token = self.index_reader.load_tensor_data_from_index_files( - start_after_key=self.last_index_token, range_steps=self.range_steps - ) - self._load_tensors_from_index_tensors(self.index_tensors_dict) - if new_index_token: # new index token can be None if there are no new index files - self._update_last_index_token(new_index_token) - - def refresh_tensors(self): + def refresh_data(self): # TODO if job finished if self.index_mode: index_tensors_dict, new_index_token = self.index_reader.load_tensor_data_from_index_files( From 5906e5aeaff65323080129806efcf9af2bfced79 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 13 Aug 2020 16:11:41 -0700 Subject: [PATCH 11/31] Add read API and tests --- smdebug/core/hook.py | 18 +++-- smdebug/core/index_reader.py | 48 +++++++++---- smdebug/core/locations.py | 36 ++++------ smdebug/core/reduction_config.py | 24 +++++-- smdebug/core/tensor.py | 35 +++++++++- smdebug/core/tfevent/index_file_writer.py | 36 +++++++--- smdebug/core/writer.py | 69 +++++++++---------- smdebug/exceptions.py | 15 ++++ smdebug/tensorflow/base_hook.py | 38 +++++++--- smdebug/trials/local_trial.py | 6 -- smdebug/trials/s3_trial.py | 6 -- smdebug/trials/trial.py | 84 ++++++++++++++--------- tests/pytorch/test_reduce_config.py | 2 +- tests/tensorflow/keras/test_keras.py | 15 ++++ tests/tensorflow2/test_keras.py | 6 +- tests/utils.py | 22 ++---- 16 files changed, 296 insertions(+), 164 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 1cd8294d8..5256c9adf 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -463,11 +463,15 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: if self.save_all_workers is False: if self.worker != self.chief_worker: return + self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker) if self._saving_shapes_in_step(): self.shape_writer = ShapeWriter( - trial_dir=self.out_dir, step=self.step, worker=self.worker + trial_dir=self.out_dir, + step=self.step, + worker=self.worker, + index_writer=self.writer.index_writer, ) def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: @@ -747,10 +751,16 @@ def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=N numpy_tensor_value = self._make_numpy_array(tensor_value) this_size, this_shape = size_and_shape(numpy_tensor_value) if tensor_ref is not None: - name = tensor_ref.tf_obj.name + original_name = tensor_ref.tf_obj.name else: - name = tensor_name - self.shape_writer.write_shape(name, this_shape) + original_name = None + self.shape_writer.write_shape( + tensor_name, + this_shape, + self.mode, + self.mode_steps[self.mode], + original_name=original_name, + ) break def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None): diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py index 3b23e1671..58256dd65 100644 --- a/smdebug/core/index_reader.py +++ b/smdebug/core/index_reader.py @@ -16,7 +16,7 @@ MISSING_EVENT_FILE_RETRY_LIMIT, 
MISSING_EVENT_FILE_RETRY_LIMIT_KEY, ) -from smdebug.core.locations import IndexFileLocationUtils, TensorLocation +from smdebug.core.locations import IndexFileLocationUtils, TensorLocation, TensorShape from smdebug.core.logger import get_logger from smdebug.core.modes import ModeKeys from smdebug.core.s3_utils import list_s3_objects @@ -203,8 +203,10 @@ def _validate(index_dict): raise IndexReaderException("meta section is not present") if len(index_dict["meta"]) == 0: raise IndexReaderException("meta section is empty") - if "tensor_payload" not in index_dict: - raise IndexReaderException("tensor_payload section is not present") + if "tensor_payload" not in index_dict and "shape_payload" not in index_dict: + raise IndexReaderException( + "neither tensor_payload nor shape_payload sections are present" + ) def _update_tensors_from_json( self, index_tensors_dict, step, response: bytes, path, worker @@ -233,28 +235,44 @@ def _update_tensors_from_json( mode = index_meta["mode"] mode = ModeKeys[mode.strip()] mode_step = index_meta["mode_step"] - event_file_name = os.path.join(path, index_meta["event_file_name"]) - tensors = index_dict["tensor_payload"] - for tensor in tensors: + + if "event_file_name" in index_meta: + event_file_name = os.path.join(path, index_meta["event_file_name"]) + else: + event_file_name = None + + tensor_payload = index_dict["tensor_payload"] + to_update_index_dict = [] + for tensor in tensor_payload: tensor_name = tensor["tensorname"] start_idx = tensor["start_idx"] length = tensor["length"] tensor_location = TensorLocation( tensor_name, mode, mode_step, event_file_name, start_idx, length, worker ) + to_update_index_dict.append((tensor_name, step, tensor_location)) + + shape_payload = index_dict["shape_payload"] + for tensor in shape_payload: + tensor_name = tensor["tensorname"] + original_name = tensor["originalname"] + shape = tensor["shape"] + ts = TensorShape(tensor_name, mode, mode_step, shape, original_name) + to_update_index_dict.append((tensor_name, step, ts)) + + for tu in to_update_index_dict: + tensor_name, step, obj = tu + if isinstance(obj, TensorLocation): + obj_dict = {"tensor_location": obj} + elif isinstance(obj, TensorShape): + obj_dict = {"tensor_shape": obj} if tensor_name in index_tensors_dict: if step in index_tensors_dict[tensor_name]: - index_tensors_dict[tensor_name][step].update( - {worker: {"tensor_location": tensor_location}} - ) + index_tensors_dict[tensor_name][step].update({worker: obj_dict}) else: - index_tensors_dict[tensor_name].update( - {step: {worker: {"tensor_location": tensor_location}}} - ) + index_tensors_dict[tensor_name].update({step: {worker: obj_dict}}) else: - index_tensors_dict[tensor_name] = { - step: {worker: {"tensor_location": tensor_location}} - } + index_tensors_dict[tensor_name] = {step: {worker: obj_dict}} return index_tensors_dict diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index 746a7a6cd..9712f58a2 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py @@ -24,6 +24,20 @@ def to_dict(self): return {"tensorname": self.tensorname, "start_idx": self.start_idx, "length": self.length} +class TensorShape: + def __init__(self, name, mode, mode_step, shape, original_name=None): + if original_name is None: + original_name = name + self.name = name + self.original_name = original_name + self.mode = mode + self.mode_step = mode_step + self.shape = tuple(shape) + + def to_dict(self): + return {"tensorname": self.name, "originalname": self.original_name, "shape": self.shape} + + 
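With TensorShape in place, shapes ride along in the same JSON index files as tensor locations. An illustrative index document after this patch; the keys come from Index.to_json(), TensorLocation.to_dict() and TensorShape.to_dict(), while the concrete values are invented (and event_file_name can be absent when a step saved only shapes, which the index_reader change above accounts for):

    # Illustrative index-file contents after this patch (values invented).
    # json.dumps serializes the TensorShape tuple as a list.
    index_doc = {
        "meta": {"mode": "TRAIN", "mode_step": 0, "event_file_name": "..."},
        "tensor_payload": [
            {"tensorname": "loss", "start_idx": 0, "length": 123},
        ],
        "shape_payload": [
            {"tensorname": "conv1.weight", "originalname": "conv1.weight",
             "shape": [20, 1, 5, 5]},
        ],
    }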
STEP_NUMBER_FORMATTING_LENGTH = "012" @@ -89,28 +103,6 @@ def get_step_dir_path(cls, trial_dir, step_num): return os.path.join(cls.get_dir(trial_dir), format(step_num, STEP_NUMBER_FORMATTING_LENGTH)) -class ShapeFileLocation(TensorFileLocation): - def __init__(self, step_num, worker_name): - super().__init__(step_num, worker_name) - - def get_filename(self): - step_num_str = self.get_step_num_str() - return f"{step_num_str}_{self.worker_name}_shapes.json" - - @classmethod - def load_filename(cls, s, print_error=True): - name = os.path.basename(s) - m = re.search("(.*)_(.*)_shapes.json$", name) - if m: - step_num = int(m.group(1)) - worker_name = m.group(2) - return cls(step_num=step_num, worker_name=worker_name) - else: - if print_error: - logger.error("Failed to load shape file location: ", s) - return None - - class TensorboardFileLocation(EventFileLocation): def __init__(self, step_num, worker_name, mode=None): super().__init__(step_num, worker_name) diff --git a/smdebug/core/reduction_config.py b/smdebug/core/reduction_config.py index 6ad78cdee..1fa6121b6 100644 --- a/smdebug/core/reduction_config.py +++ b/smdebug/core/reduction_config.py @@ -1,16 +1,25 @@ # Standard Library import json from typing import Any, Dict -from smdebug.core.logger import get_logger -logger = get_logger() # First Party +from smdebug.core.logger import get_logger from smdebug.core.utils import split +logger = get_logger() + + ALLOWED_REDUCTIONS = ["min", "max", "mean", "std", "variance", "sum", "prod"] ALLOWED_NORMS = ["l1", "l2"] REDUCTION_CONFIG_VERSION_NUM = "v0" -ALLOWED_PARAMS = ["reductions", "abs_reductions", "norms", "abs_norms", "save_raw_tensor", "save_shape"] +ALLOWED_PARAMS = [ + "reductions", + "abs_reductions", + "norms", + "abs_norms", + "save_raw_tensor", + "save_shape", +] class ReductionConfig: @@ -82,7 +91,6 @@ def _check(self): if not isinstance(self.save_shape, bool): raise ValueError(f"save_shape={self.save_shape} must be a boolean") - @classmethod def from_dict(cls, params: Dict[str, Any]) -> "ReductionConfig": """Parses a flattened dict with two keys: `save_raw_tensor` and `reductions`.""" @@ -115,7 +123,7 @@ def from_dict(cls, params: Dict[str, Any]) -> "ReductionConfig": norms=norms, abs_norms=abs_norms, save_raw_tensor=save_raw_tensor, - save_shape=save_shape + save_shape=save_shape, ) @classmethod @@ -136,7 +144,11 @@ def to_json_dict(self) -> Dict[str, Any]: all_reductions.append(f"abs_{red}_norm") all_reductions_str = ",".join(all_reductions) # Return the dict - return {"save_raw_tensor": self.save_raw_tensor, "reductions": all_reductions_str, "save_shape": self.save_shape} + return { + "save_raw_tensor": self.save_raw_tensor, + "reductions": all_reductions_str, + "save_shape": self.save_shape, + } def to_json(self) -> str: return json.dumps(self.to_json_dict()) diff --git a/smdebug/core/tensor.py b/smdebug/core/tensor.py index c268c48fc..de52f7858 100644 --- a/smdebug/core/tensor.py +++ b/smdebug/core/tensor.py @@ -10,6 +10,7 @@ from smdebug.exceptions import ( InvalidWorker, NoMoreData, + ShapeUnavailableForStep, StepNotYetAvailable, StepUnavailable, TensorUnavailableForStep, @@ -62,6 +63,16 @@ def set_step_location(self, step_num, worker, location): s = self._steps[step_num][worker] s.location = location + def set_step_shape(self, step_num, worker, shape): + step = Step(step_num, shape=shape) + if step_num not in self._steps: + self._steps[step_num] = {worker: step} + elif worker not in self._steps[step_num]: + self._steps[step_num].update({worker: step}) + + s = 
self._steps[step_num][worker] + s.shape = shape + def set_step_reduction_value(self, step_num, worker, red_name, abs, red_value): if step_num not in self._steps: s = Step(step_num) @@ -88,10 +99,11 @@ def step(self, step_num): class Step: """Contains the step number, value, location, and reduction values/locations.""" - def __init__(self, step_num, value=None, location=None): + def __init__(self, step_num, value=None, location=None, shape=None): self.step_num = step_num self.value = value self.location = location + self.shape = shape # mapping from (red_name, abs) to value self._reduction_values = {} @@ -126,6 +138,9 @@ class Tensor: def __init__(self, name, trial, cache): self._mode_steps = {} self.name = name + # SMdebug modifies some names of tensors to be more descriptive + # In such cases we save here the original name + self.original_name = None self.trial = trial self.cache = cache @@ -264,6 +279,16 @@ def value(self, step_num, mode=ModeKeys.GLOBAL, worker=None): has_reductions = has_reduction_locations or has_reduction_values raise TensorUnavailableForStep(self.name, step_num, mode, has_reductions) + def shape(self, step_num, mode=ModeKeys.GLOBAL, worker=None): + s = self._step(step_num=step_num, mode=mode, worker=worker) + if s.shape is not None: + return s.shape + try: + value = self.value(step_num, mode, worker) + return value.shape + except TensorUnavailableForStep: + raise ShapeUnavailableForStep(self.name, step_num, mode) + def reduction_values(self, step_num, mode=ModeKeys.GLOBAL, worker=None): s = self._step(step_num=step_num, mode=mode, worker=worker) if s is not None: @@ -334,9 +359,13 @@ def _create_mode_step(self, mode, mode_step): if mode not in self._mode_steps: self._mode_steps[mode] = ModeSteps(mode) - def add_step(self, mode, mode_step, worker, location): + def add_step(self, mode, mode_step, worker, tensor_location, tensor_shape): self._create_mode_step(mode, mode_step) - self._mode_steps[mode].set_step_location(mode_step, worker, location) + if tensor_location is not None: + self._mode_steps[mode].set_step_location(mode_step, worker, tensor_location) + if tensor_shape is not None: + self._mode_steps[mode].set_step_shape(mode_step, worker, tensor_shape.shape) + self.original_name = tensor_shape.original_name def add_reduction_step(self, mode, mode_step, worker, red_name, abs, red_location): self._create_mode_step(mode, mode_step) diff --git a/smdebug/core/tfevent/index_file_writer.py b/smdebug/core/tfevent/index_file_writer.py index 80cb54b68..886c7760e 100644 --- a/smdebug/core/tfevent/index_file_writer.py +++ b/smdebug/core/tfevent/index_file_writer.py @@ -13,6 +13,7 @@ def __init__(self, file_path): self.file_path = file_path self.index_payload = [] self.index_meta = {} + self.shape_payload = [] self.writer = None def __exit__(self): @@ -28,7 +29,7 @@ def _init_writer(self): def add_index(self, tensorlocation): if not self.writer: self._init_writer() - if not self.index_meta: + if not self.index_meta or not "event_file_name" in self.index_meta: self.index_meta = { "mode": tensorlocation.mode, "mode_step": tensorlocation.mode_step, @@ -36,6 +37,13 @@ def add_index(self, tensorlocation): } self.index_payload.append(tensorlocation.to_dict()) + def add_shape(self, tensorshape): + if not self.writer: + self._init_writer() + if not self.index_meta: + self.index_meta = {"mode": tensorshape.mode, "mode_step": tensorshape.mode_step} + self.shape_payload.append(tensorshape.to_dict()) + def flush(self): """Flushes the event string to file.""" if not self.writer: @@ -44,33 
+52,45 @@ def flush(self): raise ValueError( f"Cannot write empty index_meta={self.index_meta} to file {self.file_path}" ) - if not self.index_payload: + if not self.index_payload and not self.shape_payload: raise ValueError( - f"Cannot write empty index_payload={self.index_payload} to file {self.file_path}" + f"Cannot write empty payload: index_payload={self.index_payload}, shape_payload={self.shape_payload} to file {self.file_path}" ) - index = Index(meta=self.index_meta, tensor_payload=self.index_payload) + index = Index( + meta=self.index_meta, + tensor_payload=self.index_payload, + shape_payload=self.shape_payload, + ) self.writer.write(index.to_json()) self.writer.flush() self.index_meta = {} - self.index_payload = {} + self.index_payload = [] + self.shape_payload = [] def close(self): """Closes the record writer.""" if self.writer is not None: - if self.index_meta and self.index_payload: + if self.index_meta and (self.index_payload or self.shape_payload): self.flush() self.writer.close() self.writer = None class Index: - def __init__(self, meta=None, tensor_payload=None): + def __init__(self, meta=None, tensor_payload=None, shape_payload=None): self.meta = meta self.tensor_payload = tensor_payload + self.shape_payload = shape_payload def to_json(self): - return json.dumps({"meta": self.meta, "tensor_payload": self.tensor_payload}) + return json.dumps( + { + "meta": self.meta, + "tensor_payload": self.tensor_payload, + "shape_payload": self.shape_payload, + } + ) class EventWithIndex(object): diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py index 5d5cb93d5..4932e72e0 100644 --- a/smdebug/core/writer.py +++ b/smdebug/core/writer.py @@ -39,9 +39,9 @@ # Local from .locations import ( IndexFileLocationUtils, - ShapeFileLocation, TensorboardFileLocation, TensorFileLocation, + TensorShape, ) from .logger import get_logger from .modes import ModeKeys @@ -58,6 +58,7 @@ def __init__(self, trial_dir, worker, step=0, mode=ModeKeys.GLOBAL): assert False, "Worker should not be none. Check worker name initialization" self.mode = mode self._writer = None + self._index_writer = None def name(self): return self._writer.name() @@ -77,6 +78,21 @@ def flush(self): self._writer.flush() # don't flush index writer as we only want to flush on close + @classmethod + def create_index_writer(cls, trial_dir, worker, step): + el = TensorFileLocation(step_num=step, worker_name=worker) + event_file_path = el.get_file_location(trial_dir=trial_dir) + index_file_path = IndexFileLocationUtils.get_index_key_for_step(trial_dir, step, worker) + return IndexWriter(index_file_path) + + @property + def index_writer(self): + return self._index_writer + + @index_writer.setter + def index_writer(self, iw): + self._index_writer = iw + class FileWriter(BaseWriter): def __init__( @@ -90,6 +106,7 @@ def __init__( flush_secs=120, verbose=False, write_checksum=False, + index_writer=None, ): """Creates a `FileWriter` and an file. On construction the summary writer creates a new event file in `trial_dir`. 
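The motivation for threading index_writer through here: a FileWriter and a ShapeWriter belonging to the same step can now share one IndexWriter, so tensor locations and shapes land in a single index file per step and worker. A condensed sketch of the wiring the hooks adopt later in this patch (out_dir, step and worker stand for the hook's own attributes):

    # One IndexWriter serves both writers; see hook.py earlier in this patch.
    writer = FileWriter(trial_dir=out_dir, step=step, worker=worker)
    shape_writer = ShapeWriter(
        trial_dir=out_dir,
        step=step,
        worker=worker,
        index_writer=writer.index_writer,  # reuse it, don't open a second index file
    )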
@@ -113,12 +130,14 @@ def __init__( """ super(FileWriter, self).__init__(trial_dir, worker, step, mode) if wtype == "events": + if index_writer is None: + self.index_writer = self.create_index_writer( + trial_dir=trial_dir, worker=worker, step=step + ) + else: + self.index_writer = index_writer el = TensorFileLocation(step_num=self.step, worker_name=self.worker) event_file_path = el.get_file_location(trial_dir=self.trial_dir) - index_file_path = IndexFileLocationUtils.get_index_key_for_step( - self.trial_dir, self.step, self.worker - ) - self.index_writer = IndexWriter(index_file_path) elif wtype == "tensorboard": el = TensorboardFileLocation( step_num=self.step, worker_name=self.worker, mode=self.mode @@ -236,41 +255,21 @@ def _check_mode_step(mode, mode_step, global_step): class ShapeWriter(BaseWriter): - def __init__(self, trial_dir, worker, step=0, mode=ModeKeys.GLOBAL): + def __init__(self, trial_dir, worker, index_writer, step=0, mode=ModeKeys.GLOBAL): super(ShapeWriter, self).__init__(trial_dir, worker, step, mode) - el = ShapeFileLocation(step_num=self.step, worker_name=self.worker) - self.file_path = el.get_file_location(trial_dir=self.trial_dir) - s3, bucket_name, key_name = is_s3(self.file_path) - if s3: - self._writer = TSAccessS3(bucket_name, key_name, binary=False) - else: - self._writer = TSAccessFile(self.file_path, "a+") + self._index_writer = index_writer - self.shapes = [] - self.meta = {} - - def write_shape(self, name, shape: Tuple[int]): - self.shapes.append({"name": name, "shape": shape}) + def write_shape( + self, name, shape: Tuple[int], mode=ModeKeys.GLOBAL, mode_step=None, original_name=None + ): + self._index_writer.add_shape( + TensorShape(name, mode, mode_step, shape, original_name=original_name) + ) def flush(self): - if not self._writer: - raise ValueError(f"Cannot flush because self._writer={self._writer}") - if not self.shapes: - raise ValueError( - f"Cannot write shapes to file {self.file_path} as it is empty. {self.shapes}" - ) - - s = json.dumps({"meta": self.meta, "payload": self.shapes}) - self._writer.write(s) - self._writer.flush() - self.meta = {} - self.shapes = [] + self._index_writer.flush() def close(self): """Flushes the event file to disk and close the file. """ - if self._writer is not None: - if self.shapes: - self.flush() - self._writer.close() - self._writer = None + self._index_writer.close() diff --git a/smdebug/exceptions.py b/smdebug/exceptions.py index 6d917e6bf..e2ed43da9 100644 --- a/smdebug/exceptions.py +++ b/smdebug/exceptions.py @@ -68,6 +68,21 @@ def __str__(self): return msg +class ShapeUnavailableForStep(Exception): + def __init__(self, tname, step, mode=modes.GLOBAL): + self.step = step + self.mode = mode + self.tname = tname + + def __str__(self): + msg = ( + "Shape for tensor {} is not available for step {} " + "with mode {} as it was not saved." + "".format(self.tname, self.step, self.mode.name) + ) + return msg + + class TensorUnavailable(Exception): def __init__(self, tname): self.tname = tname diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 0cf30bf48..02c69b36f 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -86,6 +86,7 @@ def __init__( Example -> /job:worker/replica:0/task:1/device:GPU:0 : _job-worker_replica-0_task-1_device-GPU-0""" self.device_map = {} self.writer_map = {} + self.shape_writer_map = {} # This will be None if the var wasn't set, i.e. 
not param server self.tf_config_json = load_tf_config_json(os.getenv("TF_CONFIG")) self._hook_supported = None @@ -320,6 +321,13 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: self.writer = FileWriter( trial_dir=self.out_dir, step=self.step, worker=self.worker ) + if self._saving_shapes_in_step(): + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, + step=self.step, + worker=self.worker, + index_writer=self.writer.index_writer, + ) elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: if len(self.device_map): for device, device_string in self.device_map.items(): @@ -329,26 +337,40 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: self.writer_map[device_string] = FileWriter( trial_dir=self.out_dir, step=self.step, worker=device_string ) + if self._saving_shapes_in_step(): + self.shape_writer[device_string] = ShapeWriter( + trial_dir=self.out_dir, + step=self.step, + worker=self.worker, + index_writer=self.writer_map[device_string].index_writer, + ) else: # training on CPU when all device strings have cpu if self.writer is None or only_initialize_if_missing is False: self.writer = FileWriter( trial_dir=self.out_dir, step=self.step, worker=self.worker ) + if self._saving_shapes_in_step(): + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, + step=self.step, + worker=self.worker, + index_writer=self.writer.index_writer, + ) + elif self.distribution_strategy == TFDistributionStrategy.NONE: if self.writer is None or only_initialize_if_missing is False: self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker) + if self._saving_shapes_in_step(): + self.shape_writer = ShapeWriter( + trial_dir=self.out_dir, + step=self.step, + worker=self.worker, + index_writer=self.writer.index_writer, + ) else: raise NotImplementedError - if self._saving_shapes_in_step(): - if self.shape_writer is None or only_initialize_if_missing is False: - self.shape_writer = ShapeWriter( - trial_dir=self.out_dir, step=self.step, worker=self.worker - ) - else: - assert self.shape_writer is None - def _close_writers(self) -> None: if self.dry_run: return diff --git a/smdebug/trials/local_trial.py b/smdebug/trials/local_trial.py index e8cfbe51e..272acff83 100644 --- a/smdebug/trials/local_trial.py +++ b/smdebug/trials/local_trial.py @@ -39,12 +39,6 @@ def __init__( def _get_collection_files(self) -> list: return list_collection_files_in_directory(get_path_to_collections(self.path)) - def _load_tensors_from_index_tensors(self, index_tensors_dict): - for tname in index_tensors_dict: - for step, itds in index_tensors_dict[tname].items(): - for worker in itds: - self._add_tensor(int(step), worker, itds[worker]["tensor_location"]) - def _read_collections(self, collection_files): first_collection_file = collection_files[0] # First Collection File self.collection_manager = CollectionManager.load(first_collection_file) diff --git a/smdebug/trials/s3_trial.py b/smdebug/trials/s3_trial.py index c2751fb40..374ecdfad 100644 --- a/smdebug/trials/s3_trial.py +++ b/smdebug/trials/s3_trial.py @@ -56,12 +56,6 @@ def _get_collection_files(self) -> list: ) return collection_files - def _load_tensors_from_index_tensors(self, index_tensors_dict): - for tname in index_tensors_dict: - for step, itds in index_tensors_dict[tname].items(): - for worker in itds: - self._add_tensor(int(step), worker, itds[worker]["tensor_location"]) - def _read_collections(self, collection_files): first_collection_file = collection_files[0] # First Collection 
File key = os.path.join(first_collection_file) diff --git a/smdebug/trials/trial.py b/smdebug/trials/trial.py index 2d527305e..c1c8d9c84 100644 --- a/smdebug/trials/trial.py +++ b/smdebug/trials/trial.py @@ -14,7 +14,7 @@ TRAINING_END_DELAY_REFRESH_DEFAULT, TRAINING_END_DELAY_REFRESH_KEY, ) -from smdebug.core.locations import IndexFileLocationUtils, TensorLocation +from smdebug.core.locations import IndexFileLocationUtils, TensorLocation, TensorShape from smdebug.core.logger import get_logger from smdebug.core.modes import ModeKeys from smdebug.core.reductions import REDUCTIONS_PREFIX, reverse_reduction_tensor_name @@ -231,14 +231,14 @@ def has_tensor(self, tname): self.maybe_refresh(tname) return tname in self._tensors - def _populate_step_dict(self, tensor_object, step_num): - if tensor_object.mode != ModeKeys.GLOBAL: - if tensor_object.mode not in self._mode_to_global: - self._mode_to_global[tensor_object.mode] = {} - if tensor_object.mode_step not in self._mode_to_global[tensor_object.mode]: - self._mode_to_global[tensor_object.mode][tensor_object.mode_step] = int(step_num) + def _populate_step_dict(self, mode, mode_step, step_num): + if mode != ModeKeys.GLOBAL: + if mode not in self._mode_to_global: + self._mode_to_global[mode] = {} + if mode_step not in self._mode_to_global[mode]: + self._mode_to_global[mode][mode_step] = int(step_num) if step_num not in self._global_to_mode: - self._global_to_mode[step_num] = (tensor_object.mode, tensor_object.mode_step) + self._global_to_mode[step_num] = (mode, mode_step) def _populate_workers_for_global_step(self, step, worker) -> None: """ @@ -263,7 +263,7 @@ def _populate_workers_for_global_step(self, step, worker) -> None: self.last_complete_step = step self.logger.debug(f"Populating last completing step to: {step}") - def _populate_global_step_to_tensor_name_map(self, tensor: TensorLocation, step_num) -> None: + def _populate_global_step_to_tensor_name_map(self, tensorname: str, step_num) -> None: """ The self.global_step_to_tensors_map dictionary holds a mapping of step number and a set of all the tensor names that have been written for the step. @@ -274,47 +274,67 @@ def _populate_global_step_to_tensor_name_map(self, tensor: TensorLocation, step_ """ if step_num not in self.global_step_to_tensors_map: self.global_step_to_tensors_map[step_num] = set() - self.global_step_to_tensors_map[step_num].add(tensor.tensorname) + self.global_step_to_tensors_map[step_num].add(tensorname) - def _populate_mode_to_tensor_name_map(self, tensor: TensorLocation) -> None: + def _populate_mode_to_tensor_name_map(self, tensorname, mode) -> None: """ The self.mode_to_tensors_map dictionary holds a mapping of mode and a set of all the tensor names that have been written for the mode. 
:param tensor: :return: """ - if tensor.mode != ModeKeys.GLOBAL: - if tensor.mode not in self.mode_to_tensors_map: - self.mode_to_tensors_map[tensor.mode] = set() - self.mode_to_tensors_map[tensor.mode].add(tensor.tensorname) + if mode != ModeKeys.GLOBAL: + if mode not in self.mode_to_tensors_map: + self.mode_to_tensors_map[mode] = set() + self.mode_to_tensors_map[mode].add(tensorname) - def _add_tensor(self, step_num, worker, tensor_object: TensorLocation): + def _load_tensors_from_index_tensors(self, index_tensors_dict): + for tname in index_tensors_dict: + for step, itds in index_tensors_dict[tname].items(): + for worker in itds: + self._add_tensor( + int(step), + worker, + itds[worker].get("tensor_location", None), + itds[worker].get("tensor_shape", None), + ) + + def _add_tensor( + self, step_num, worker, tensor_location: TensorLocation, tensor_shape: TensorShape + ): is_reduction = False - if REDUCTIONS_PREFIX in tensor_object.tensorname: - tname, red_name, abs = reverse_reduction_tensor_name(tensor_object.tensorname) - tensor_object.tensorname = tname - is_reduction = True + if tensor_location is not None: + tensorname = tensor_location.tensorname + mode = tensor_location.mode + mode_step = tensor_location.mode_step + elif tensor_shape is not None: + tensorname = tensor_shape.name + mode = tensor_shape.mode + mode_step = tensor_shape.mode_step else: - tname = tensor_object.tensorname + raise RuntimeError("both tensor_location and tensor_shape can't be None") - if tname not in self._tensors: - tensor = Tensor(tname, trial=self, cache=self.cache) - self._tensors[tname] = tensor + if REDUCTIONS_PREFIX in tensorname: + tensorname, red_name, abs = reverse_reduction_tensor_name(tensorname) + is_reduction = True - tensor = self._tensors[tname] + if tensorname not in self._tensors: + tensor = Tensor(tensorname, trial=self, cache=self.cache) + self._tensors[tensorname] = tensor + + tensor = self._tensors[tensorname] if is_reduction: - tensor.add_reduction_step( - tensor_object.mode, tensor_object.mode_step, worker, red_name, abs, tensor_object - ) + tensor.add_reduction_step(mode, mode_step, worker, red_name, abs, tensor_location) else: - tensor.add_step(tensor_object.mode, tensor_object.mode_step, worker, tensor_object) + # shape can only be passed for actual tensor, not reductions + tensor.add_step(mode, mode_step, worker, tensor_location, tensor_shape) - self._populate_step_dict(tensor_object, step_num) - self._populate_global_step_to_tensor_name_map(tensor_object, step_num) + self._populate_step_dict(mode, mode_step, step_num) + self._populate_global_step_to_tensor_name_map(tensorname, step_num) self._populate_workers_for_global_step(step_num, worker) - self._populate_mode_to_tensor_name_map(tensor_object) + self._populate_mode_to_tensor_name_map(tensorname, mode) def _tensors_matching_regex(self, regex_list) -> set: matched_tensornames = set() diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index 7390f335f..5f6f6796a 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -149,7 +149,7 @@ def forward(self, x): hook.register_module(model) optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) - verify_shapes(out_dir, 0, 4) + verify_shapes(out_dir, 0, 41) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/tensorflow/keras/test_keras.py b/tests/tensorflow/keras/test_keras.py index 2fcb379f6..654d05418 100644 --- 
a/tests/tensorflow/keras/test_keras.py +++ b/tests/tensorflow/keras/test_keras.py @@ -5,6 +5,7 @@ import pytest import tensorflow as tf from tests.tensorflow.utils import create_trial_fast_refresh +from tests.utils import verify_shapes # First Party from smdebug.core.access_layer import has_training_ended @@ -224,6 +225,20 @@ def test_tf_keras(out_dir): exhaustive_check(out_dir, True) +@pytest.mark.slow # 0:07 to run +def test_tf_keras_shapes(out_dir): + train_model( + out_dir, + save_all=True, + reduction_config=ReductionConfig(save_shape=True), + use_tf_keras=True, + save_config=SaveConfig(save_steps=[0, 10]), + eager=False, + steps=["train", "eval", "predict", "train"], + ) + verify_shapes(out_dir, 0, 21) + + @pytest.mark.slow # 0:03 to run def test_tf_keras_non_keras_opt(out_dir): include_collections = [ diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 6e9d932ec..349281d5c 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -191,8 +191,8 @@ def test_keras_gradtape_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_gradtape(trial_dir=out_dir, hook=hook) - verify_shapes(out_dir, 0, 9) - verify_shapes(out_dir, 500, 14) + verify_shapes(out_dir, 0, 10) + verify_shapes(out_dir, 500, 15) @pytest.mark.skip_if_non_eager @@ -473,7 +473,7 @@ def test_keras_fit_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_fit(trial_dir=out_dir, hook=hook) - verify_shapes(out_dir, 0, 9) + verify_shapes(out_dir, 0, 12) @pytest.mark.slow diff --git a/tests/utils.py b/tests/utils.py index 4472cf7bc..298e77d38 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -16,8 +16,8 @@ DEFAULT_WORKER_NAME, TENSORBOARD_CONFIG_FILE_PATH_ENV_STR, ) -from smdebug.core.locations import ShapeFileLocation from smdebug.core.utils import is_s3, remove_file_if_exists +from smdebug.trials import create_trial def use_s3_datasets(): @@ -31,20 +31,12 @@ def use_s3_datasets(): def verify_shapes(out_dir, step_num, num_tensors): - sl = ShapeFileLocation(step_num, DEFAULT_WORKER_NAME) - path = os.path.join(out_dir, sl.get_file_location()) - with open(path) as jsfile: - shape_dict = json.load(jsfile) - print(shape_dict["payload"]) - assert "payload" in shape_dict - assert len(shape_dict["payload"]) == num_tensors, ( - len(shape_dict["payload"]), - shape_dict["payload"], - ) - for ts in shape_dict["payload"]: - for dim in ts["shape"]: - assert isinstance(dim, int) - assert isinstance(ts["name"], str) + trial = create_trial(out_dir) + tnames = trial.tensor_names(step=step_num) + assert num_tensors == len(tnames), (len(tnames), tnames) + for tname in tnames: + tensor = trial.tensor(tname) + assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) class SagemakerSimulator(object): From 681e35c21b5cb4f2108e3d59dcb1d7024f55fb95 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 13 Aug 2020 18:33:14 -0700 Subject: [PATCH 12/31] Add mxnet test --- tests/mxnet/test_hook_reduce_config.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py index 245a16a80..550f82a1a 100644 --- a/tests/mxnet/test_hook_reduce_config.py +++ b/tests/mxnet/test_hook_reduce_config.py @@ -2,6 +2,9 @@ import shutil from datetime import datetime +# Third Party +from tests.utils import verify_shapes + # First Party from smdebug.mxnet import ReductionConfig, SaveConfig from smdebug.mxnet.hook import Hook as t_hook @@ 
-24,6 +27,7 @@ def test_save_config(hook=None, out_dir=None): hook = t_hook( out_dir=out_dir, save_config=global_save_config, + save_all=True, include_collections=[ "weights", "biases", @@ -82,6 +86,26 @@ def test_save_config(hook=None, out_dir=None): shutil.rmtree(out_dir) +def test_save_shapes(out_dir, hook=None): + hook_created = False + if hook is None: + hook_created = True + global_reduce_config = ReductionConfig(save_shape=True) + global_save_config = SaveConfig(save_steps=[0, 1]) + + hook = t_hook( + out_dir=out_dir, + save_config=global_save_config, + save_all=True, + reduction_config=global_reduce_config, + ) + run_mnist_gluon_model(hook=hook, num_steps_train=5) + verify_shapes(out_dir, 0, 49) + verify_shapes(out_dir, 1, 49) + if hook_created: + shutil.rmtree(out_dir) + + def test_save_config_hook_from_json(): from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR import os From 5dc47ffa2da577f2066fa90808547ea78ad47c52 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 13 Aug 2020 18:59:57 -0700 Subject: [PATCH 13/31] Add s3 and json tests --- docs/api.md | 1 + .../test_json_configs/test_hook_save_shape.json | 9 +++++++++ tests/pytorch/test_reduce_config.py | 13 +++++++++++++ tests/tensorflow/hooks/test_estimator_modes.py | 5 +++++ 4 files changed, 28 insertions(+) create mode 100644 tests/pytorch/test_json_configs/test_hook_save_shape.json diff --git a/docs/api.md b/docs/api.md index 778cf3e46..3490b14a5 100644 --- a/docs/api.md +++ b/docs/api.md @@ -96,6 +96,7 @@ include_workers include_regex reductions save_raw_tensor +save_shape save_interval save_steps start_step diff --git a/tests/pytorch/test_json_configs/test_hook_save_shape.json b/tests/pytorch/test_json_configs/test_hook_save_shape.json new file mode 100644 index 000000000..166051af4 --- /dev/null +++ b/tests/pytorch/test_json_configs/test_hook_save_shape.json @@ -0,0 +1,9 @@ +{ + "S3Path": "s3://kjndjknd_bucket/prefix", + "LocalPath": "/tmp/test_output/test_hook_save_shape/jsonloading", + "HookParameters": { + "save_all": true, + "save_shape": true, + "save_steps": "0,1" + } + } diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index 5f6f6796a..0f6c26d74 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -154,6 +154,19 @@ def forward(self, x): shutil.rmtree(out_dir) +def test_save_shapes_json(): + from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR + + out_dir = "/tmp/test_output/test_hook_save_shape/jsonloading" + shutil.rmtree(out_dir, True) + os.environ[ + CONFIG_FILE_PATH_ENV_STR + ] = "tests/pytorch/test_json_configs/test_hook_save_shape.json" + hook = t_hook.create_from_json_file() + test_save_shapes(hook=hook, out_dir=out_dir) + shutil.rmtree(out_dir, True) + + # Test creating hook by loading the json file with reduction configs. 
def test_reduce_config_with_json(): from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py index fe481ace4..c947c8ddf 100644 --- a/tests/tensorflow/hooks/test_estimator_modes.py +++ b/tests/tensorflow/hooks/test_estimator_modes.py @@ -212,6 +212,11 @@ def test_mnist_shapes(out_dir, on_s3=False): verify_shapes(out_dir, 0, 249) +@pytest.mark.slow # 0:02 to run +def test_mnist_shapes_s3(out_dir): + test_mnist_shapes(out_dir, on_s3=True) + + @pytest.mark.slow # 0:02 to run def test_mnist_local_json(out_dir, monkeypatch): monkeypatch.setenv( From c775942c805b0fc1c3bccbe4945a08535142f8f5 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Fri, 14 Aug 2020 15:14:50 -0700 Subject: [PATCH 14/31] lint --- smdebug/core/writer.py | 3 --- tests/utils.py | 1 - 2 files changed, 4 deletions(-) diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py index 4932e72e0..3ed1f6da5 100644 --- a/smdebug/core/writer.py +++ b/smdebug/core/writer.py @@ -17,11 +17,9 @@ """APIs for logging data in the event file.""" # Standard Library -import json from typing import Tuple # First Party -from smdebug.core.access_layer import TSAccessFile, TSAccessS3 from smdebug.core.modes import MODE_PLUGIN_NAME, MODE_STEP_PLUGIN_NAME from smdebug.core.tfevent.event_file_writer import EventFileWriter from smdebug.core.tfevent.index_file_writer import IndexWriter @@ -34,7 +32,6 @@ scalar_summary, ) from smdebug.core.tfevent.util import make_tensor_proto -from smdebug.core.utils import is_s3 # Local from .locations import ( diff --git a/tests/utils.py b/tests/utils.py index 298e77d38..d087c93a7 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,5 +1,4 @@ # Standard Library -import json import os import shutil from pathlib import Path From 355be0bf58587f5bce296bb6d2387806aa912f63 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 17 Aug 2020 12:00:43 -0700 Subject: [PATCH 15/31] Fix payload --- smdebug/core/index_reader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py index 58256dd65..5af1ef157 100644 --- a/smdebug/core/index_reader.py +++ b/smdebug/core/index_reader.py @@ -241,8 +241,9 @@ def _update_tensors_from_json( else: event_file_name = None - tensor_payload = index_dict["tensor_payload"] to_update_index_dict = [] + + tensor_payload = index_dict.get("tensor_payload", []) for tensor in tensor_payload: tensor_name = tensor["tensorname"] start_idx = tensor["start_idx"] @@ -252,7 +253,7 @@ def _update_tensors_from_json( ) to_update_index_dict.append((tensor_name, step, tensor_location)) - shape_payload = index_dict["shape_payload"] + shape_payload = index_dict.get("shape_payload", []) for tensor in shape_payload: tensor_name = tensor["tensorname"] original_name = tensor["originalname"] From 3eb0202ef545ab19e247e2df93ad53c392a227ff Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 17 Aug 2020 12:08:22 -0700 Subject: [PATCH 16/31] fix import --- tests/tensorflow/hooks/test_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index b27589ff6..9803b3ac2 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -1,7 +1,7 @@ # Standard Library # Third Party -from tests.tensorflow2.utils import verify_shapes +from tests.utils import verify_shapes # First Party import 
smdebug.tensorflow as smd From c14a67ec57f88bb4e1336f874bf80e23fab69090 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 17 Aug 2020 12:41:44 -0700 Subject: [PATCH 17/31] Handle different num tensors for losses --- tests/pytorch/test_reduce_config.py | 3 ++- tests/utils.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index 0f6c26d74..c33baed6c 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -149,7 +149,8 @@ def forward(self, x): hook.register_module(model) optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) - verify_shapes(out_dir, 0, 41) + # different versions seem to output different tensors + verify_shapes(out_dir, 0, 41, exact_equal=False) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/utils.py b/tests/utils.py index d087c93a7..65ed28ced 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -29,10 +29,13 @@ def use_s3_datasets(): return False -def verify_shapes(out_dir, step_num, num_tensors): +def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True): trial = create_trial(out_dir) tnames = trial.tensor_names(step=step_num) - assert num_tensors == len(tnames), (len(tnames), tnames) + if exact_equal: + assert num_tensors == len(tnames), (len(tnames), tnames) + else: + assert num_tensors >= len(tnames), (len(tnames), tnames) for tname in tnames: tensor = trial.tensor(tname) assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) From d12b824bbe7970847978fbe1b510c9011225671e Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 17 Aug 2020 12:56:36 -0700 Subject: [PATCH 18/31] Fix exact equal condition --- tests/pytorch/test_reduce_config.py | 2 +- tests/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index c33baed6c..f3d7b1214 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -149,7 +149,7 @@ def forward(self, x): hook.register_module(model) optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) - # different versions seem to output different tensors + # different versions seem to output different number of loss tensors verify_shapes(out_dir, 0, 41, exact_equal=False) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/utils.py b/tests/utils.py index 65ed28ced..f8bc07159 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -35,7 +35,7 @@ def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True): if exact_equal: assert num_tensors == len(tnames), (len(tnames), tnames) else: - assert num_tensors >= len(tnames), (len(tnames), tnames) + assert num_tensors <= len(tnames), (len(tnames), tnames) for tname in tnames: tensor = trial.tensor(tname) assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) From 972d95a71e63b7c0620d10f86f04dd3dc9271546 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Mon, 17 Aug 2020 16:27:50 -0700 Subject: [PATCH 19/31] Fix mode bug --- smdebug/core/locations.py | 3 +++ tests/tensorflow/hooks/test_reductions.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index 9712f58a2..de99f72de 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py 
@@ -23,6 +23,9 @@ def __init__(self, tname, mode, mode_step, event_file_name, start_idx, length, w def to_dict(self): return {"tensorname": self.tensorname, "start_idx": self.start_idx, "length": self.length} + def get_mode(self): + return str(self.mode).split(".")[-1] + class TensorShape: def __init__(self, name, mode, mode_step, shape, original_name=None): diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index 9803b3ac2..1b857fecc 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -70,7 +70,7 @@ def test_shapes(out_dir, save_raw_tensor=False): include_collections=["weights", "gradients", "losses"], ) simple_model(hook) - verify_shapes(out_dir, 0, 2) + verify_shapes(out_dir, 0, 3) def test_reductions_with_raw_tensor(out_dir): From 850cc4446e98e40eca525fad128c2d643ff14d3a Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 18 Aug 2020 11:04:22 -0700 Subject: [PATCH 20/31] trigger CI From 2c4479697fb27a5ae2dbb25f6d38f9b124874f66 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 18 Aug 2020 22:10:17 -0700 Subject: [PATCH 21/31] Add support for distributed training with writer map --- smdebug/core/hook.py | 48 ++++++++++++++-------- smdebug/core/locations.py | 3 ++ smdebug/tensorflow/base_hook.py | 52 ++++++++++-------------- tests/tensorflow2/test_keras_mirrored.py | 14 +++++++ tests/utils.py | 14 ++++++- 5 files changed, 81 insertions(+), 50 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 5256c9adf..345ee5317 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -410,6 +410,17 @@ def _prepare_collections(self): self.prepared_collections = True #### End of Save Manager methods #### + @staticmethod + def _close_given_writer_map(writer_dict): + # Delete all the dist training writers + to_delete_writers = [] + for key, writer in writer_dict.items(): + # close calls flush + writer.close() + to_delete_writers.append(key) + + for key in to_delete_writers: + del writer_dict[key] def _close_writers(self) -> None: if self.dry_run: @@ -423,15 +434,7 @@ def _close_writers(self) -> None: self.writer.close() self.writer = None - to_delete_writers = [] - # Delete all the tb writers - for mode, writer in self.tb_writers.items(): - if writer is not None: - writer.flush() - writer.close() - to_delete_writers.append(mode) - for mode in to_delete_writers: - del self.tb_writers[mode] + self._close_given_writer_map(self.tb_writers) if self.shape_writer is not None: self.shape_writer.close() @@ -474,7 +477,13 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: index_writer=self.writer.index_writer, ) - def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: + def _get_single_process_writers(self, shape_writers=False) -> List[FileWriter]: + if shape_writers is False: + return [self.writer] if self.writer else [] + else: + return [self.shape_writer] if self.shape_writer else [] + + def _get_writers(self, tensor_name, tensor_ref=None, shape_writers=False) -> List[FileWriter]: """ :param tensor_name: :param tensor_ref: used by TF @@ -482,7 +491,7 @@ def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: """ if self.save_all_workers is False and self.worker != self.chief_worker: return [] - return [self.writer] if self.writer else [] + return self._get_single_process_writers(shape_writers) def _maybe_get_tb_writer(self) -> Optional[FileWriter]: """ Returns a FileWriter object if `hook.tensorboard_dir` has 
been specified, else None. @@ -745,6 +754,7 @@ def _write_raw_tensor(self, tensor_name, tensor_value, save_collections, tensor_ break def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=None): + shape_writers = self._get_writers(tensor_name, tensor_ref=tensor_ref, shape_writers=True) for s_col in save_collections: reduction_config = s_col.reduction_config if self.dry_run is False and reduction_config.save_shape is True: @@ -754,13 +764,15 @@ def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=N original_name = tensor_ref.tf_obj.name else: original_name = None - self.shape_writer.write_shape( - tensor_name, - this_shape, - self.mode, - self.mode_steps[self.mode], - original_name=original_name, - ) + + for writer in shape_writers: + writer.write_shape( + tensor_name, + this_shape, + self.mode, + self.mode_steps[self.mode], + original_name=original_name, + ) break def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None): diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index de99f72de..03c389224 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py @@ -40,6 +40,9 @@ def __init__(self, name, mode, mode_step, shape, original_name=None): def to_dict(self): return {"tensorname": self.name, "originalname": self.original_name, "shape": self.shape} + def get_mode(self): + return str(self.mode).split(".")[-1] + STEP_NUMBER_FORMATTING_LENGTH = "012" diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 02c69b36f..ad6557ade 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -262,7 +262,7 @@ def _set_chief_worker(self): elif self.distribution_strategy == TFDistributionStrategy.UNSUPPORTED: raise NotImplementedError - def _get_writers(self, tensor_name, tensor_ref) -> List[FileWriter]: + def _get_writers(self, tensor_name, tensor_ref, shape_writers=False) -> List[FileWriter]: """ For tensors generated during distributed tf jobs, we map the tensor to a writer with its device attribute. 
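Stepping back from the writer plumbing: all of the routing above is driven by a single user-facing flag, ReductionConfig(save_shape=True). A minimal sketch of a script that exercises this path through the public TensorFlow hook API (the output path and the fit() call are illustrative):

    import smdebug.tensorflow as smd

    # A collection whose ReductionConfig sets save_shape=True makes
    # _saving_shapes_in_step() return True, so _initialize_writers() also
    # builds a ShapeWriter that shares the FileWriter's index writer.
    hook = smd.KerasHook(
        out_dir="/tmp/shape_run",  # illustrative location
        save_all=True,
        save_config=smd.SaveConfig(save_steps=[0]),
        reduction_config=smd.ReductionConfig(save_shape=True),
    )
    # model.fit(..., callbacks=[hook]) would then record shapes at step 0.

Reading the run back goes through the usual trial API; because only shapes were written, shape() succeeds while value() raises for ordinary tensors, mirroring what the tests in this series assert:

    from smdebug.trials import create_trial
    from smdebug.exceptions import TensorUnavailableForStep

    trial = create_trial("/tmp/shape_run")
    for tname in trial.tensor_names(step=0):
        print(tname, trial.tensor(tname).shape(0))  # a tuple of ints
        try:
            trial.tensor(tname).value(0)  # scalar metrics may still have values
        except TensorUnavailableForStep:
            pass  # expected for ordinary tensors: no raw data was saved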
@@ -278,8 +278,8 @@ def _get_writers(self, tensor_name, tensor_ref) -> List[FileWriter]: TFDistributionStrategy.PARAMETER_SERVER, TFDistributionStrategy.HOROVOD, ]: - if (self.save_all_workers is True or self.worker == self.chief_worker) and self.writer: - return [self.writer] + if self.save_all_workers is True or self.worker == self.chief_worker: + return self._get_single_process_writers(shape_writers) elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: if len(self.device_map): # else is for metrics in Keras @@ -290,17 +290,25 @@ def _get_writers(self, tensor_name, tensor_ref) -> List[FileWriter]: # if device str is empty or cpu in worker if not bool(worker) or "CPU" in worker: if self.save_all_workers: - return list(self.writer_map.values()) + if shape_writers is False: + return list(self.writer_map.values()) + else: + return list(self.shape_writer_map.values()) else: - return [self.writer_map[self.device_map[self.chief_worker]]] + if shape_writers is False: + return [self.writer_map[self.device_map[self.chief_worker]]] + else: + return [self.shape_writer_map[self.device_map[self.chief_worker]]] elif self.save_all_workers or worker == self.chief_worker: - return [self.writer_map[self.device_map[worker]]] - elif self.writer: + if shape_writers is False: + return [self.writer_map[self.device_map[worker]]] + else: + return [self.shape_writer_map[self.device_map[worker]]] + else: # training on CPU when all device strings have cpu - return [self.writer] + return self._get_single_process_writers(shape_writers) elif self.distribution_strategy == TFDistributionStrategy.NONE: - if self.writer: - return [self.writer] + return self._get_single_process_writers(shape_writers) else: raise NotImplementedError # when self.writer is None, returns empty list @@ -338,7 +346,7 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None: trial_dir=self.out_dir, step=self.step, worker=device_string ) if self._saving_shapes_in_step(): - self.shape_writer[device_string] = ShapeWriter( + self.shape_writer_map[device_string] = ShapeWriter( trial_dir=self.out_dir, step=self.step, worker=self.worker, @@ -383,25 +391,9 @@ def _close_writers(self) -> None: self.writer.close() self.writer = None - # Delete all the dist training writers - to_delete_writers = [] - for device, writer in self.writer_map.items(): - writer.flush() - writer.close() - to_delete_writers.append(device) - - for device in to_delete_writers: - del self.writer_map[device] - - to_delete_writers = [] - # Delete all the tb writers - for mode, writer in self.tb_writers.items(): - if writer is not None: - writer.flush() - writer.close() - to_delete_writers.append(mode) - for mode in to_delete_writers: - del self.tb_writers[mode] + self._close_given_writer_map(self.writer_map) + self._close_given_writer_map(self.shape_writer_map) + self._close_given_writer_map(self.tb_writers) if self.shape_writer is not None: self.shape_writer.close() diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index d857218ab..c68bffdff 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -13,6 +13,7 @@ from tests.core.utils import verify_files from tests.tensorflow2.utils import is_tf_2_2, is_tf_2_3 from tests.tensorflow.utils import create_trial_fast_refresh +from tests.utils import verify_shapes # First Party import smdebug.tensorflow as smd @@ -290,6 +291,19 @@ def test_save_all(out_dir, tf_eager_mode, workers): verify_files(out_dir, save_config, 
saved_scalars) +@pytest.mark.slow +def test_shapes(out_dir, tf_eager_mode): + train_model( + out_dir, + save_all=True, + save_config=SaveConfig(save_steps=[0]), + reduction_config=ReductionConfig(save_shape=True), + steps=["train"], + eager=tf_eager_mode, + ) + verify_shapes(out_dir, 0, 15, multiworker=True) + + @pytest.mark.slow def test_base_reductions(out_dir, tf_eager_mode): train_model( diff --git a/tests/utils.py b/tests/utils.py index f8bc07159..ab3fb703b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -29,7 +29,7 @@ def use_s3_datasets(): return False -def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True): +def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True, multiworker=False): trial = create_trial(out_dir) tnames = trial.tensor_names(step=step_num) if exact_equal: @@ -38,7 +38,17 @@ def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True): assert num_tensors <= len(tnames), (len(tnames), tnames) for tname in tnames: tensor = trial.tensor(tname) - assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) + if multiworker is False: + assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) + else: + workers = tensor.workers(step_num) + assert len(workers) > 1 + for w in workers: + assert isinstance(tensor.shape(step_num, worker=w), tuple), ( + tname, + w, + tensor.shape(step_num, worker=w), + ) class SagemakerSimulator(object): From 1b09b8e40dcea0a6a6052941d2c018ed5e591a56 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 18 Aug 2020 22:48:44 -0700 Subject: [PATCH 22/31] Check that value throws exception --- smdebug/core/index_reader.py | 81 +++++++++++++----------------------- smdebug/core/tensor.py | 1 + tests/utils.py | 25 +++++++++++ 3 files changed, 54 insertions(+), 53 deletions(-) diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py index 5af1ef157..525d8608e 100644 --- a/smdebug/core/index_reader.py +++ b/smdebug/core/index_reader.py @@ -120,12 +120,22 @@ def fetch_tensor_value(self, tensor_location: TensorLocation): def list_event_files(self, start_after_prefix): pass - @abstractmethod def load_tensor_data_from_index_files( self, start_after_key=None, range_steps=None ) -> Tuple[Dict[str, Dict[int, Dict[str, TensorLocation]]], str]: """Return a triply nested dict referring to tensor data.""" + responses, steps, last_index_token, workers = self.read_index_files( + start_after_key, range_steps + ) + + tensor_data = {} + for step, response, worker in zip(steps, responses, workers): + tensor_data = self._update_tensors_from_json( + tensor_data, step, response, self.path, worker + ) + return tensor_data, last_index_token + @abstractmethod def _is_event_file_present(self, file_name) -> bool: pass @@ -236,30 +246,26 @@ def _update_tensors_from_json( mode = ModeKeys[mode.strip()] mode_step = index_meta["mode_step"] - if "event_file_name" in index_meta: - event_file_name = os.path.join(path, index_meta["event_file_name"]) - else: - event_file_name = None - to_update_index_dict = [] - tensor_payload = index_dict.get("tensor_payload", []) - for tensor in tensor_payload: - tensor_name = tensor["tensorname"] - start_idx = tensor["start_idx"] - length = tensor["length"] - tensor_location = TensorLocation( - tensor_name, mode, mode_step, event_file_name, start_idx, length, worker - ) - to_update_index_dict.append((tensor_name, step, tensor_location)) + if "tensor_payload" in index_dict: + event_file_name = os.path.join(path, index_meta["event_file_name"]) + for tensor in 
index_dict["tensor_payload"]: + tensor_name = tensor["tensorname"] + start_idx = tensor["start_idx"] + length = tensor["length"] + tensor_location = TensorLocation( + tensor_name, mode, mode_step, event_file_name, start_idx, length, worker + ) + to_update_index_dict.append((tensor_name, step, tensor_location)) - shape_payload = index_dict.get("shape_payload", []) - for tensor in shape_payload: - tensor_name = tensor["tensorname"] - original_name = tensor["originalname"] - shape = tensor["shape"] - ts = TensorShape(tensor_name, mode, mode_step, shape, original_name) - to_update_index_dict.append((tensor_name, step, ts)) + if "shape_payload" in index_dict: + for tensor in index_dict["shape_payload"]: + tensor_name = tensor["tensorname"] + original_name = tensor["originalname"] + shape = tensor["shape"] + ts = TensorShape(tensor_name, mode, mode_step, shape, original_name) + to_update_index_dict.append((tensor_name, step, ts)) for tu in to_update_index_dict: tensor_name, step, obj = tu @@ -304,22 +310,6 @@ def fetch_tensor_value(self, tensor_location: TensorLocation) -> np.ndarray: tensor_name, step, tensor_data, mode, mode_step = tensor_tuple return tensor_data - def load_tensor_data_from_index_files( - self, start_after_key=None, range_steps=None - ) -> Tuple[Dict[str, Dict[int, Dict[str, TensorLocation]]], str]: - """Return a triply nested dict referring to tensor data.""" - - responses, steps, last_index_token, workers = self.read_index_files( - start_after_key, range_steps - ) - - tensor_data = {} - for step, response, worker in zip(steps, responses, workers): - tensor_data = self._update_tensors_from_json( - tensor_data, step, response, self.path, worker - ) - return tensor_data, last_index_token - def read_index_files( self, start_after_key: str, range_steps=None ) -> Tuple[List[bytes], list, str, List[str]]: @@ -417,21 +407,6 @@ def fetch_tensor_value(self, tensor_location: TensorLocation) -> np.ndarray: tensor_name, step, tensor_data, mode, mode_step = tensor_tuple return tensor_data - def load_tensor_data_from_index_files( - self, start_after_key=None, range_steps=None - ) -> Tuple[Dict[str, Dict[int, Dict[str, TensorLocation]]], str]: - """Return a triply nested dict referring to tensor data.""" - - responses, steps, last_index_token, workers = self.read_index_files( - start_after_key, range_steps - ) - tensor_data = {} - for step, response, worker in zip(steps, responses, workers): - tensor_data = self._update_tensors_from_json( - tensor_data, step, response, self.path, worker - ) - return tensor_data, last_index_token - def read_index_files( self, start_after_key: str, range_steps=None ) -> Tuple[List[bytes], list, str, List[str]]: diff --git a/smdebug/core/tensor.py b/smdebug/core/tensor.py index de52f7858..db2098c3f 100644 --- a/smdebug/core/tensor.py +++ b/smdebug/core/tensor.py @@ -266,6 +266,7 @@ def values(self, mode=ModeKeys.GLOBAL, worker=None): def value(self, step_num, mode=ModeKeys.GLOBAL, worker=None): # step refreshes s = self._step(step_num=step_num, mode=mode, worker=worker) + print(s, s.value) if s.value is not None: return s.value elif s.location is not None: diff --git a/tests/utils.py b/tests/utils.py index ab3fb703b..555dc8719 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,6 +5,7 @@ # Third Party import boto3 +import numpy as np from tests.constants import TEST_DATASET_S3_PATH # First Party @@ -16,6 +17,7 @@ TENSORBOARD_CONFIG_FILE_PATH_ENV_STR, ) from smdebug.core.utils import is_s3, remove_file_if_exists +from smdebug.exceptions import 
TensorUnavailableForStep from smdebug.trials import create_trial @@ -29,6 +31,15 @@ def use_s3_datasets(): return False +def is_scalar(x): + if isinstance(x, list): + if len(x) == 1: + return True + elif isinstance(x, np.ndarray): + return True + return False + + def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True, multiworker=False): trial = create_trial(out_dir) tnames = trial.tensor_names(step=step_num) @@ -40,10 +51,24 @@ def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True, multiworker= tensor = trial.tensor(tname) if multiworker is False: assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) + try: + if not is_scalar(tensor.value(step_num)): + # test did not save value except scalars which dont use reduction config + # so it should raise the below exception + assert False + except TensorUnavailableForStep: + pass else: workers = tensor.workers(step_num) assert len(workers) > 1 for w in workers: + try: + if not is_scalar(tensor.value(step_num, worker=w)): + # test did not save value so it should raise the below exception + assert False + except TensorUnavailableForStep: + pass + assert isinstance(tensor.shape(step_num, worker=w), tuple), ( tname, w, From f4106f33a2a6c3c81dda817a4f4759aee4c2f127 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 19 Aug 2020 13:00:06 -0700 Subject: [PATCH 23/31] Fix tests to make them more resilient --- smdebug/core/tensor.py | 1 - tests/mxnet/test_hook_reduce_config.py | 4 ++-- tests/tensorflow/hooks/test_estimator_modes.py | 11 ++++++++++- tests/tensorflow/hooks/test_reductions.py | 6 +++++- tests/tensorflow/keras/test_keras.py | 4 +++- tests/tensorflow2/test_keras.py | 7 ++++--- tests/tensorflow2/test_keras_mirrored.py | 12 +++++++++++- tests/utils.py | 13 ++++--------- 8 files changed, 39 insertions(+), 19 deletions(-) diff --git a/smdebug/core/tensor.py b/smdebug/core/tensor.py index db2098c3f..de52f7858 100644 --- a/smdebug/core/tensor.py +++ b/smdebug/core/tensor.py @@ -266,7 +266,6 @@ def values(self, mode=ModeKeys.GLOBAL, worker=None): def value(self, step_num, mode=ModeKeys.GLOBAL, worker=None): # step refreshes s = self._step(step_num=step_num, mode=mode, worker=worker) - print(s, s.value) if s.value is not None: return s.value elif s.location is not None: diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py index 550f82a1a..898776d87 100644 --- a/tests/mxnet/test_hook_reduce_config.py +++ b/tests/mxnet/test_hook_reduce_config.py @@ -100,8 +100,8 @@ def test_save_shapes(out_dir, hook=None): reduction_config=global_reduce_config, ) run_mnist_gluon_model(hook=hook, num_steps_train=5) - verify_shapes(out_dir, 0, 49) - verify_shapes(out_dir, 1, 49) + verify_shapes(out_dir, 0, 40, exact_equal=False) + verify_shapes(out_dir, 1, 40, exact_equal=False) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py index c947c8ddf..62a4f114e 100644 --- a/tests/tensorflow/hooks/test_estimator_modes.py +++ b/tests/tensorflow/hooks/test_estimator_modes.py @@ -18,6 +18,7 @@ import pytest import tensorflow as tf from tests.analysis.utils import delete_s3_prefix +from tests.tensorflow.utils import create_trial_fast_refresh from tests.utils import verify_shapes # First Party @@ -209,7 +210,15 @@ def test_mnist_shapes(out_dir, on_s3=False): steps=None, reduction_config=smd.ReductionConfig(save_shape=True), ) - verify_shapes(out_dir, 0, 249) + verify_shapes( + out_dir, + 0, + 
[ + "conv2d/kernel:0", + "gradients/sparse_softmax_cross_entropy_loss/value_grad/Sum:0", + "dense_1/kernel:0", + ], + ) @pytest.mark.slow # 0:02 to run diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index 1b857fecc..3cb18624c 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -70,7 +70,11 @@ def test_shapes(out_dir, save_raw_tensor=False): include_collections=["weights", "gradients", "losses"], ) simple_model(hook) - verify_shapes(out_dir, 0, 3) + verify_shapes( + out_dir, + 0, + ["foobar/weight1:0", "gradients/MatMul_grad/tuple/control_dependency_1:0", "loss:0"], + ) def test_reductions_with_raw_tensor(out_dir): diff --git a/tests/tensorflow/keras/test_keras.py b/tests/tensorflow/keras/test_keras.py index 654d05418..a122722be 100644 --- a/tests/tensorflow/keras/test_keras.py +++ b/tests/tensorflow/keras/test_keras.py @@ -236,7 +236,9 @@ def test_tf_keras_shapes(out_dir): eager=False, steps=["train", "eval", "predict", "train"], ) - verify_shapes(out_dir, 0, 21) + verify_shapes( + out_dir, 0, ["training/RMSprop/momentum:0", "dense/weights/dense/kernel:0", "batch", "loss"] + ) @pytest.mark.slow # 0:03 to run diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index 349281d5c..cc676eb20 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -191,8 +191,8 @@ def test_keras_gradtape_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_gradtape(trial_dir=out_dir, hook=hook) - verify_shapes(out_dir, 0, 10) - verify_shapes(out_dir, 500, 15) + verify_shapes(out_dir, 0, ["gradients/dense_1/biasGrad", "weights/dense/bias:0", "loss"]) + verify_shapes(out_dir, 500, ["weights/dense/bias:0", "Adam/learning_rate:0", "loss"]) @pytest.mark.skip_if_non_eager @@ -473,7 +473,8 @@ def test_keras_fit_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_fit(trial_dir=out_dir, hook=hook) - verify_shapes(out_dir, 0, 12) + print(create_trial_fast_refresh(out_dir).tensor_names(step=0)) + verify_shapes(out_dir, 0, ["dense/weights/dense/kernel:0", "accuracy", "Adam/beta_1:0"]) @pytest.mark.slow diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index c68bffdff..c49b65e3d 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -301,7 +301,17 @@ def test_shapes(out_dir, tf_eager_mode): steps=["train"], eager=tf_eager_mode, ) - verify_shapes(out_dir, 0, 15, multiworker=True) + verify_shapes( + out_dir, + 0, + [ + "dense_1/weights/dense_1/kernel:0", + "scalar/foobar", + "dense/weights/dense/bias:0", + "Adam/decay:0", + ], + multiworker=True, + ) @pytest.mark.slow diff --git a/tests/utils.py b/tests/utils.py index 555dc8719..98e1091f1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -7,6 +7,7 @@ import boto3 import numpy as np from tests.constants import TEST_DATASET_S3_PATH +from tests.tensorflow.utils import create_trial_fast_refresh # First Party from smdebug.core.config_constants import ( @@ -18,7 +19,6 @@ ) from smdebug.core.utils import is_s3, remove_file_if_exists from smdebug.exceptions import TensorUnavailableForStep -from smdebug.trials import create_trial def use_s3_datasets(): @@ -40,14 +40,9 @@ def is_scalar(x): return False -def verify_shapes(out_dir, step_num, num_tensors, exact_equal=True, multiworker=False): - trial = create_trial(out_dir) - tnames = trial.tensor_names(step=step_num) - if 
exact_equal: - assert num_tensors == len(tnames), (len(tnames), tnames) - else: - assert num_tensors <= len(tnames), (len(tnames), tnames) - for tname in tnames: +def verify_shapes(out_dir, step_num, tensornames, multiworker=False): + trial = create_trial_fast_refresh(out_dir) + for tname in tensornames: tensor = trial.tensor(tname) if multiworker is False: assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) From 78b67d615b3a5604f6d09d4a12d7524dbba3d450 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 19 Aug 2020 13:05:57 -0700 Subject: [PATCH 24/31] Fix mxnet and pytorch tests --- tests/mxnet/test_hook_reduce_config.py | 26 ++++++++++++++++++++++++-- tests/pytorch/test_reduce_config.py | 4 +++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py index 898776d87..3f8aa11d1 100644 --- a/tests/mxnet/test_hook_reduce_config.py +++ b/tests/mxnet/test_hook_reduce_config.py @@ -100,8 +100,30 @@ def test_save_shapes(out_dir, hook=None): reduction_config=global_reduce_config, ) run_mnist_gluon_model(hook=hook, num_steps_train=5) - verify_shapes(out_dir, 0, 40, exact_equal=False) - verify_shapes(out_dir, 1, 40, exact_equal=False) + verify_shapes( + out_dir, + 0, + [ + "dense0_relu_input_0", + "dense0_relu_output_0", + "pool1_output_0", + "gradient/dense0_weight", + "conv0_weight", + "softmaxcrossentropyloss0_input_0", + ], + ) + verify_shapes( + out_dir, + 1, + [ + "dense0_relu_input_0", + "dense0_relu_output_0", + "pool1_output_0", + "gradient/dense0_weight", + "conv0_weight", + "softmaxcrossentropyloss0_input_0", + ], + ) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index f3d7b1214..633770cf6 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -150,7 +150,9 @@ def forward(self, x): optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) # different versions seem to output different number of loss tensors - verify_shapes(out_dir, 0, 41, exact_equal=False) + verify_shapes( + out_dir, 0, ["conv2_input_0", "NestedNet_fc1.bias", "gradient/NestedNet_conv2.weight"] + ) if hook_created: shutil.rmtree(out_dir) From 2515a2db58b7082e764991ce9725074d68d7cf3d Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 19 Aug 2020 13:15:28 -0700 Subject: [PATCH 25/31] Remove tensor names --- tests/mxnet/test_hook_reduce_config.py | 26 ++----------------- tests/pytorch/test_reduce_config.py | 4 +-- .../tensorflow/hooks/test_estimator_modes.py | 10 +------ tests/tensorflow/hooks/test_reductions.py | 6 +---- tests/tensorflow/keras/test_keras.py | 4 +-- tests/tensorflow2/test_keras.py | 4 +-- tests/tensorflow2/test_keras_mirrored.py | 12 +-------- tests/utils.py | 4 +-- 8 files changed, 11 insertions(+), 59 deletions(-) diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py index 3f8aa11d1..1ef2be4a5 100644 --- a/tests/mxnet/test_hook_reduce_config.py +++ b/tests/mxnet/test_hook_reduce_config.py @@ -100,30 +100,8 @@ def test_save_shapes(out_dir, hook=None): reduction_config=global_reduce_config, ) run_mnist_gluon_model(hook=hook, num_steps_train=5) - verify_shapes( - out_dir, - 0, - [ - "dense0_relu_input_0", - "dense0_relu_output_0", - "pool1_output_0", - "gradient/dense0_weight", - "conv0_weight", - "softmaxcrossentropyloss0_input_0", - ], - ) - 
verify_shapes( - out_dir, - 1, - [ - "dense0_relu_input_0", - "dense0_relu_output_0", - "pool1_output_0", - "gradient/dense0_weight", - "conv0_weight", - "softmaxcrossentropyloss0_input_0", - ], - ) + verify_shapes(out_dir, 0) + verify_shapes(out_dir, 1) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/pytorch/test_reduce_config.py b/tests/pytorch/test_reduce_config.py index 633770cf6..de97b30da 100644 --- a/tests/pytorch/test_reduce_config.py +++ b/tests/pytorch/test_reduce_config.py @@ -150,9 +150,7 @@ def forward(self, x): optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9) train(model, hook, torch.device("cpu"), optimizer, num_steps=10) # different versions seem to output different number of loss tensors - verify_shapes( - out_dir, 0, ["conv2_input_0", "NestedNet_fc1.bias", "gradient/NestedNet_conv2.weight"] - ) + verify_shapes(out_dir, 0) if hook_created: shutil.rmtree(out_dir) diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py index 62a4f114e..b07ab1914 100644 --- a/tests/tensorflow/hooks/test_estimator_modes.py +++ b/tests/tensorflow/hooks/test_estimator_modes.py @@ -210,15 +210,7 @@ def test_mnist_shapes(out_dir, on_s3=False): steps=None, reduction_config=smd.ReductionConfig(save_shape=True), ) - verify_shapes( - out_dir, - 0, - [ - "conv2d/kernel:0", - "gradients/sparse_softmax_cross_entropy_loss/value_grad/Sum:0", - "dense_1/kernel:0", - ], - ) + verify_shapes(out_dir, 0) @pytest.mark.slow # 0:02 to run diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index 3cb18624c..e009f4565 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -70,11 +70,7 @@ def test_shapes(out_dir, save_raw_tensor=False): include_collections=["weights", "gradients", "losses"], ) simple_model(hook) - verify_shapes( - out_dir, - 0, - ["foobar/weight1:0", "gradients/MatMul_grad/tuple/control_dependency_1:0", "loss:0"], - ) + verify_shapes(out_dir, 0) def test_reductions_with_raw_tensor(out_dir): diff --git a/tests/tensorflow/keras/test_keras.py b/tests/tensorflow/keras/test_keras.py index a122722be..bfd5e7cc7 100644 --- a/tests/tensorflow/keras/test_keras.py +++ b/tests/tensorflow/keras/test_keras.py @@ -236,9 +236,7 @@ def test_tf_keras_shapes(out_dir): eager=False, steps=["train", "eval", "predict", "train"], ) - verify_shapes( - out_dir, 0, ["training/RMSprop/momentum:0", "dense/weights/dense/kernel:0", "batch", "loss"] - ) + verify_shapes(out_dir, 0) @pytest.mark.slow # 0:03 to run diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index cc676eb20..ffa5c9ae8 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -191,8 +191,8 @@ def test_keras_gradtape_shapes(out_dir): reduction_config=ReductionConfig(save_shape=True), ) helper_keras_gradtape(trial_dir=out_dir, hook=hook) - verify_shapes(out_dir, 0, ["gradients/dense_1/biasGrad", "weights/dense/bias:0", "loss"]) - verify_shapes(out_dir, 500, ["weights/dense/bias:0", "Adam/learning_rate:0", "loss"]) + verify_shapes(out_dir, 0) + verify_shapes(out_dir, 500) @pytest.mark.skip_if_non_eager diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index c49b65e3d..3d9ab247d 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -301,17 +301,7 @@ def test_shapes(out_dir, tf_eager_mode): steps=["train"], eager=tf_eager_mode, ) - 
verify_shapes( - out_dir, - 0, - [ - "dense_1/weights/dense_1/kernel:0", - "scalar/foobar", - "dense/weights/dense/bias:0", - "Adam/decay:0", - ], - multiworker=True, - ) + verify_shapes(out_dir, 0, multiworker=True) @pytest.mark.slow diff --git a/tests/utils.py b/tests/utils.py index 98e1091f1..af827f264 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -40,9 +40,9 @@ def is_scalar(x): return False -def verify_shapes(out_dir, step_num, tensornames, multiworker=False): +def verify_shapes(out_dir, step_num, multiworker=False): trial = create_trial_fast_refresh(out_dir) - for tname in tensornames: + for tname in trial.tensor_names(step=step_num): tensor = trial.tensor(tname) if multiworker is False: assert isinstance(tensor.shape(step_num), tuple), (tname, tensor.shape(step_num)) From 7f3ea4e57b497ea4b15fc60b9eed9f2ea2667b60 Mon Sep 17 00:00:00 2001 From: NihalHarish Date: Wed, 19 Aug 2020 13:47:48 -0700 Subject: [PATCH 26/31] pre-commmit --- tests/tensorflow/hooks/test_estimator_modes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py index b07ab1914..b7de19d6e 100644 --- a/tests/tensorflow/hooks/test_estimator_modes.py +++ b/tests/tensorflow/hooks/test_estimator_modes.py @@ -18,7 +18,6 @@ import pytest import tensorflow as tf from tests.analysis.utils import delete_s3_prefix -from tests.tensorflow.utils import create_trial_fast_refresh from tests.utils import verify_shapes # First Party From cdf6578b21f20a8ff33fb138337a768bbd773227 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 19 Aug 2020 15:23:07 -0700 Subject: [PATCH 27/31] Fix get_mode --- smdebug/core/index_reader.py | 4 ++-- smdebug/core/locations.py | 6 ------ smdebug/core/writer.py | 2 +- tests/mxnet/test_hook_reduce_config.py | 26 +++++++++++--------------- 4 files changed, 14 insertions(+), 24 deletions(-) diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py index 525d8608e..dfc8eb0d2 100644 --- a/smdebug/core/index_reader.py +++ b/smdebug/core/index_reader.py @@ -248,7 +248,7 @@ def _update_tensors_from_json( to_update_index_dict = [] - if "tensor_payload" in index_dict: + if len(index_dict["tensor_payload"]): event_file_name = os.path.join(path, index_meta["event_file_name"]) for tensor in index_dict["tensor_payload"]: tensor_name = tensor["tensorname"] @@ -259,7 +259,7 @@ def _update_tensors_from_json( ) to_update_index_dict.append((tensor_name, step, tensor_location)) - if "shape_payload" in index_dict: + if len(index_dict["shape_payload"]): for tensor in index_dict["shape_payload"]: tensor_name = tensor["tensorname"] original_name = tensor["originalname"] diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index 03c389224..9712f58a2 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py @@ -23,9 +23,6 @@ def __init__(self, tname, mode, mode_step, event_file_name, start_idx, length, w def to_dict(self): return {"tensorname": self.tensorname, "start_idx": self.start_idx, "length": self.length} - def get_mode(self): - return str(self.mode).split(".")[-1] - class TensorShape: def __init__(self, name, mode, mode_step, shape, original_name=None): @@ -40,9 +37,6 @@ def __init__(self, name, mode, mode_step, shape, original_name=None): def to_dict(self): return {"tensorname": self.name, "originalname": self.original_name, "shape": self.shape} - def get_mode(self): - return str(self.mode).split(".")[-1] - STEP_NUMBER_FORMATTING_LENGTH = "012" diff --git a/smdebug/core/writer.py 
From 7f3ea4e57b497ea4b15fc60b9eed9f2ea2667b60 Mon Sep 17 00:00:00 2001
From: NihalHarish
Date: Wed, 19 Aug 2020 13:47:48 -0700
Subject: [PATCH 26/31] pre-commit

---
 tests/tensorflow/hooks/test_estimator_modes.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/tensorflow/hooks/test_estimator_modes.py b/tests/tensorflow/hooks/test_estimator_modes.py
index b07ab1914..b7de19d6e 100644
--- a/tests/tensorflow/hooks/test_estimator_modes.py
+++ b/tests/tensorflow/hooks/test_estimator_modes.py
@@ -18,7 +18,6 @@
 import pytest
 import tensorflow as tf
 from tests.analysis.utils import delete_s3_prefix
-from tests.tensorflow.utils import create_trial_fast_refresh
 from tests.utils import verify_shapes

 # First Party

From cdf6578b21f20a8ff33fb138337a768bbd773227 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 19 Aug 2020 15:23:07 -0700
Subject: [PATCH 27/31] Fix get_mode

---
 smdebug/core/index_reader.py           |  4 ++--
 smdebug/core/locations.py              |  6 ------
 smdebug/core/writer.py                 |  2 +-
 tests/mxnet/test_hook_reduce_config.py | 26 +++++++++++---------------
 4 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py
index 525d8608e..dfc8eb0d2 100644
--- a/smdebug/core/index_reader.py
+++ b/smdebug/core/index_reader.py
@@ -248,7 +248,7 @@ def _update_tensors_from_json(

         to_update_index_dict = []

-        if "tensor_payload" in index_dict:
+        if len(index_dict["tensor_payload"]):
             event_file_name = os.path.join(path, index_meta["event_file_name"])
             for tensor in index_dict["tensor_payload"]:
                 tensor_name = tensor["tensorname"]
@@ -259,7 +259,7 @@
                 )
                 to_update_index_dict.append((tensor_name, step, tensor_location))

-        if "shape_payload" in index_dict:
+        if len(index_dict["shape_payload"]):
             for tensor in index_dict["shape_payload"]:
                 tensor_name = tensor["tensorname"]
                 original_name = tensor["originalname"]
diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py
index 03c389224..9712f58a2 100644
--- a/smdebug/core/locations.py
+++ b/smdebug/core/locations.py
@@ -23,9 +23,6 @@ def __init__(self, tname, mode, mode_step, event_file_name, start_idx, length, worker):
     def to_dict(self):
         return {"tensorname": self.tensorname, "start_idx": self.start_idx, "length": self.length}

-    def get_mode(self):
-        return str(self.mode).split(".")[-1]
-

 class TensorShape:
     def __init__(self, name, mode, mode_step, shape, original_name=None):
@@ -40,9 +37,6 @@ def __init__(self, name, mode, mode_step, shape, original_name=None):
     def to_dict(self):
         return {"tensorname": self.name, "originalname": self.original_name, "shape": self.shape}

-    def get_mode(self):
-        return str(self.mode).split(".")[-1]
-

 STEP_NUMBER_FORMATTING_LENGTH = "012"
diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py
index 3ed1f6da5..3966da82b 100644
--- a/smdebug/core/writer.py
+++ b/smdebug/core/writer.py
@@ -260,7 +260,7 @@ def write_shape(
         self, name, shape: Tuple[int], mode=ModeKeys.GLOBAL, mode_step=None, original_name=None
     ):
         self._index_writer.add_shape(
-            TensorShape(name, mode, mode_step, shape, original_name=original_name)
+            TensorShape(name, mode.name, mode_step, shape, original_name=original_name)
         )

     def flush(self):
diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py
index 1ef2be4a5..46476414d 100644
--- a/tests/mxnet/test_hook_reduce_config.py
+++ b/tests/mxnet/test_hook_reduce_config.py
@@ -86,24 +86,20 @@ def test_save_config(hook=None, out_dir=None):
         shutil.rmtree(out_dir)


-def test_save_shapes(out_dir, hook=None):
-    hook_created = False
-    if hook is None:
-        hook_created = True
-        global_reduce_config = ReductionConfig(save_shape=True)
-        global_save_config = SaveConfig(save_steps=[0, 1])
-
-        hook = t_hook(
-            out_dir=out_dir,
-            save_config=global_save_config,
-            save_all=True,
-            reduction_config=global_reduce_config,
-        )
+def test_save_shapes(out_dir):
+    global_reduce_config = ReductionConfig(save_shape=True)
+    global_save_config = SaveConfig(save_steps=[0, 1])
+
+    hook = t_hook(
+        out_dir=out_dir,
+        save_config=global_save_config,
+        save_all=True,
+        reduction_config=global_reduce_config,
+    )
     run_mnist_gluon_model(hook=hook, num_steps_train=5)
     verify_shapes(out_dir, 0)
     verify_shapes(out_dir, 1)
-    if hook_created:
-        shutil.rmtree(out_dir)
+    shutil.rmtree(out_dir)


 def test_save_config_hook_from_json():

From d16d1de0b523b5f778a891f31fb0d847292d8272 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 19 Aug 2020 16:07:27 -0700
Subject: [PATCH 28/31] Fix bug with old index files

---
 smdebug/core/index_reader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/smdebug/core/index_reader.py b/smdebug/core/index_reader.py
index dfc8eb0d2..2c182d0d0 100644
--- a/smdebug/core/index_reader.py
+++ b/smdebug/core/index_reader.py
@@ -248,7 +248,7 @@ def _update_tensors_from_json(

         to_update_index_dict = []

-        if len(index_dict["tensor_payload"]):
+        if "tensor_payload" in index_dict and len(index_dict["tensor_payload"]):
             event_file_name = os.path.join(path, index_meta["event_file_name"])
             for tensor in index_dict["tensor_payload"]:
                 tensor_name = tensor["tensorname"]
@@ -259,7 +259,7 @@
                 )
                 to_update_index_dict.append((tensor_name, step, tensor_location))

-        if len(index_dict["shape_payload"]):
+        if "shape_payload" in index_dict and len(index_dict["shape_payload"]):
             for tensor in index_dict["shape_payload"]:
                 tensor_name = tensor["tensorname"]
                 original_name = tensor["originalname"]
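[Editor's note: the guard added in PATCH 28 exists because index files written before shape saving was introduced carry no "shape_payload" key (and very old ones may lack "tensor_payload"), so indexing into them directly raises KeyError. A minimal sketch with hypothetical index dicts; the payload keys match TensorShape.to_dict above:]

    # Sketch, not part of the patch: why the membership test must precede len().
    old_index = {"tensor_payload": []}  # written by an older smdebug version
    new_index = {"tensor_payload": [], "shape_payload": [
        {"tensorname": "dense/kernel:0", "originalname": None, "shape": [784, 128]}
    ]}

    for index_dict in (old_index, new_index):
        # len(index_dict["shape_payload"]) alone would raise KeyError on old_index
        if "shape_payload" in index_dict and len(index_dict["shape_payload"]):
            for tensor in index_dict["shape_payload"]:
                print(tensor["tensorname"], tuple(tensor["shape"]))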
From 384b71c8a46069af79b46564c3288c58a62f7020 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 19 Aug 2020 18:17:08 -0700
Subject: [PATCH 29/31] Fix keras test with names of tensors

---
 tests/tensorflow2/test_keras.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py
index ffa5c9ae8..3df832404 100644
--- a/tests/tensorflow2/test_keras.py
+++ b/tests/tensorflow2/test_keras.py
@@ -474,7 +474,7 @@ def test_keras_fit_shapes(out_dir):
     )
     helper_keras_fit(trial_dir=out_dir, hook=hook)
     print(create_trial_fast_refresh(out_dir).tensor_names(step=0))
-    verify_shapes(out_dir, 0, ["dense/weights/dense/kernel:0", "accuracy", "Adam/beta_1:0"])
+    verify_shapes(out_dir, 0)


 @pytest.mark.slow

From cd8a4d15f79405f4df1655d3424ad31e3de646b0 Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 19 Aug 2020 18:43:11 -0700
Subject: [PATCH 30/31] Set original name to None if tf_obj is None

---
 smdebug/core/hook.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py
index 345ee5317..758782620 100644
--- a/smdebug/core/hook.py
+++ b/smdebug/core/hook.py
@@ -760,7 +760,7 @@ def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=None):
         if self.dry_run is False and reduction_config.save_shape is True:
             numpy_tensor_value = self._make_numpy_array(tensor_value)
             this_size, this_shape = size_and_shape(numpy_tensor_value)
-            if tensor_ref is not None:
+            if tensor_ref is not None and tensor_ref.tf_obj is not None:
                 original_name = tensor_ref.tf_obj.name
             else:
                 original_name = None

From c4881b7541d62b276edebc97f9d9b8f0044ccfac Mon Sep 17 00:00:00 2001
From: Rahul Huilgol
Date: Wed, 19 Aug 2020 19:33:31 -0700
Subject: [PATCH 31/31] Fix mirrored test for cpu

---
 tests/tensorflow2/test_keras_mirrored.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py
index 3d9ab247d..3f0bafe4f 100644
--- a/tests/tensorflow2/test_keras_mirrored.py
+++ b/tests/tensorflow2/test_keras_mirrored.py
@@ -293,7 +293,7 @@ def test_save_all(out_dir, tf_eager_mode, workers):

 @pytest.mark.slow
 def test_shapes(out_dir, tf_eager_mode):
-    train_model(
+    strategy, _ = train_model(
         out_dir,
         save_all=True,
         save_config=SaveConfig(save_steps=[0]),
@@ -301,7 +301,8 @@
         steps=["train"],
         eager=tf_eager_mode,
     )
-    verify_shapes(out_dir, 0, multiworker=True)
+    multiworker = strategy.num_replicas_in_sync > 1
+    verify_shapes(out_dir, 0, multiworker=multiworker)


 @pytest.mark.slow
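[Editor's note: PATCH 31 stops hard-coding multiworker=True because tf.distribute.MirroredStrategy falls back to a single replica on a CPU-only host, where shapes are recorded as for a single worker. A sketch of the derived flag, illustrative only:]

    # Sketch, not part of the patch: deriving multiworker the way the fixed test does.
    import tensorflow as tf

    strategy = tf.distribute.MirroredStrategy()  # all visible GPUs, else one CPU replica
    multiworker = strategy.num_replicas_in_sync > 1
    print(strategy.num_replicas_in_sync, multiworker)  # e.g. "1 False" on a CPU-only box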