From d9f9696fec927228cbe6c5bd7527b499c3f960e8 Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Wed, 10 Nov 2021 09:13:32 -0800 Subject: [PATCH 1/9] update reductions --- smdebug/core/hook.py | 55 ++++++++++++++++++++++++-------- smdebug/core/locations.py | 9 ++++-- smdebug/core/reduction_config.py | 35 +++++++++++++++++--- smdebug/core/reductions.py | 14 +++++--- smdebug/pytorch/utils.py | 23 +++++++++---- 5 files changed, 104 insertions(+), 32 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 38ff7c135..92146fc42 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -512,7 +512,7 @@ def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: return [] return self._get_main_writer() - def _maybe_get_tb_writer(self) -> Optional[FileWriter]: + def _maybe_get_tb_writer(self, subfolder=None) -> Optional[FileWriter]: """ Returns a FileWriter object if `hook.tensorboard_dir` has been specified, else None. Creates a writer if does not exist. @@ -520,22 +520,25 @@ def _maybe_get_tb_writer(self) -> Optional[FileWriter]: if not self.tensorboard_dir: return None - if self.mode in self.tb_writers: - assert self.tb_writers[self.mode] is not None + if subfolder == None: + subfolder = self.mode + + if subfolder in self.tb_writers: + assert self.tb_writers[subfolder] is not None # would be there if set_mode was called - return self.tb_writers[self.mode] + return self.tb_writers[subfolder] else: # s = self.step # if s < 0: s = 0 - self.tb_writers[self.mode] = FileWriter( + self.tb_writers[subfolder] = FileWriter( trial_dir=self.tensorboard_dir, step=self.step, worker=get_tb_worker(), write_checksum=True, wtype="tensorboard", - mode=self.mode, + mode=subfolder, ) - return self.tb_writers[self.mode] + return self.tb_writers[subfolder] def _close_tb_writer(self): if self.dry_run: @@ -660,16 +663,32 @@ def export_collections(self): collection_file_name = f"{self.worker}_collections.json" self.collection_manager.export(self.out_dir, collection_file_name) - def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs): - return get_reduction_tensor_name(tensor_name, reduction_name, abs, remove_colon_index=True) + def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs, collection_name=""): + return get_reduction_tensor_name( + tensor_name, + reduction_name, + abs, + remove_colon_index=True, + collection_name=collection_name, + ) - def _write_reduction(self, tensor_name, tensor_value, reduction_name, abs, tensor_ref=None): - reduction_tensor_name = self._get_reduction_tensor_name(tensor_name, reduction_name, abs) + def _write_reduction( + self, tensor_name, tensor_value, reduction_name, abs, tensor_ref=None, collection_name="" + ): + reduction_tensor_name = self._get_reduction_tensor_name( + tensor_name, reduction_name, abs, collection_name=collection_name + ) try: tensor_data = self._get_reduction_of_data( reduction_name, tensor_value, tensor_name, abs ) self._write_raw_tensor_simple(reduction_tensor_name, tensor_data, tensor_ref=tensor_ref) + if abs: + reduction_name = "abs_" + reduction_name + tb_writer = self._maybe_get_tb_writer(subfolder=reduction_name) + if tb_writer: + scalar = self._make_numpy_array(tensor_data) + tb_writer.write_scalar_summary(reduction_tensor_name, scalar, self.step) except ValueError as e: self.logger.warning( f"Could not compute reduction {reduction_name} of {tensor_name} due to {e}" @@ -685,14 +704,24 @@ def _write_reductions(self, tensor_name, tensor_value, save_collections, tensor_ for reduction in reduction_list: if (reduction, False) not in reductions_saved: self._write_reduction( - tensor_name, tensor_value, reduction, abs=False, tensor_ref=tensor_ref + tensor_name, + tensor_value, + reduction, + abs=False, + tensor_ref=tensor_ref, + collection_name=s_col.name, ) reductions_saved.add((reduction, False)) for reduction_list in (reduction_config.abs_reductions, reduction_config.abs_norms): for reduction in reduction_list: if (reduction, True) not in reductions_saved: self._write_reduction( - tensor_name, tensor_value, reduction, abs=True, tensor_ref=tensor_ref + tensor_name, + tensor_value, + reduction, + abs=True, + tensor_ref=tensor_ref, + collection_name=s_col.name, ) reductions_saved.add((reduction, True)) diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index 0ee17b7cf..6146eb80c 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py @@ -120,11 +120,14 @@ def __init__(self, step_num, worker_name, mode=None): def get_file_location(self, base_dir=""): # when base_dir is empty it just returns the relative file path + if hasattr(self.mode, "name"): + subfolder = self.mode.name + else: + subfolder = self.mode if base_dir: - event_key_prefix = os.path.join(base_dir, self.mode.name) + event_key_prefix = os.path.join(base_dir, subfolder) else: - event_key_prefix = os.path.join(self.type, self.mode.name) - + event_key_prefix = os.path.join(self.type, subfolder) return os.path.join(event_key_prefix, self.get_filename()) diff --git a/smdebug/core/reduction_config.py b/smdebug/core/reduction_config.py index 1fa6121b6..955ed68c0 100644 --- a/smdebug/core/reduction_config.py +++ b/smdebug/core/reduction_config.py @@ -3,13 +3,25 @@ from typing import Any, Dict # First Party +from smdebug.analysis.utils import parse_bool from smdebug.core.logger import get_logger from smdebug.core.utils import split logger = get_logger() -ALLOWED_REDUCTIONS = ["min", "max", "mean", "std", "variance", "sum", "prod"] +ALLOWED_REDUCTIONS = [ + "min", + "max", + "mean", + "std", + "variance", + "sum", + "prod", + "isnan", + "isinf", + "quantile", +] ALLOWED_NORMS = ["l1", "l2"] REDUCTION_CONFIG_VERSION_NUM = "v0" ALLOWED_PARAMS = [ @@ -66,7 +78,7 @@ def __init__( self.abs_reductions = abs_reductions if abs_reductions is not None else [] self.norms = norms if norms is not None else [] self.abs_norms = abs_norms if abs_norms is not None else [] - self.save_raw_tensor = save_raw_tensor + self.save_raw_tensor = parse_bool(save_raw_tensor, True) self.save_shape = save_shape ## DO NOT REMOVE, if you add anything here, please make sure that _check & from_json is updated accordingly self._check() @@ -77,11 +89,24 @@ def _check(self): raise ValueError( "allowed params for reduction config can only be one of " + ",".join(ALLOWED_PARAMS) ) - - if any([x not in ALLOWED_REDUCTIONS for x in self.reductions]): + for index, reduction_allowed in enumerate( + [x in ALLOWED_REDUCTIONS for x in self.reductions] + ): + if reduction_allowed or self.reductions[index].startswith("quantile"): + continue raise ValueError("reductions can only be one of " + ",".join(ALLOWED_REDUCTIONS)) - if any([x not in ALLOWED_REDUCTIONS for x in self.abs_reductions]): + + for index, reduction_allowed in enumerate( + [x in ALLOWED_REDUCTIONS for x in self.abs_reductions] + ): + if reduction_allowed or self.abs_reductions[index].startswith("quantile"): + continue raise ValueError("abs_reductions can only be one of " + ",".join(ALLOWED_REDUCTIONS)) + + # if any([x not in ALLOWED_REDUCTIONS for x in self.reductions]): + # raise ValueError("reductions can only be one of " + ",".join(ALLOWED_REDUCTIONS)) + # if any([x not in ALLOWED_REDUCTIONS for x in self.abs_reductions]): + # raise ValueError("abs_reductions can only be one of " + ",".join(ALLOWED_REDUCTIONS)) if any([x not in ALLOWED_NORMS for x in self.norms]): raise ValueError("norms can only be one of " + ",".join(ALLOWED_NORMS)) if any([x not in ALLOWED_NORMS for x in self.abs_norms]): diff --git a/smdebug/core/reductions.py b/smdebug/core/reductions.py index 3be6c98b5..15c04cbc5 100644 --- a/smdebug/core/reductions.py +++ b/smdebug/core/reductions.py @@ -42,15 +42,19 @@ def get_basic_numpy_reduction(reduction_name, numpy_data): return None -def get_reduction_tensor_name(tensorname, reduction_name, abs, remove_colon_index=True): +def get_reduction_tensor_name( + tensorname, reduction_name, abs, remove_colon_index=True, collection_name="smdebug" +): # for frameworks other than TF, it makes sense to not have trailing :0, :1 # but for TF, it makes sense to keep it consistent with TF traditional naming style - tname = f"{reduction_name}/{tensorname}" + # tname = f"{reduction_name}/{tensorname}" + tname = f"{tensorname}" if remove_colon_index: tname = re.sub(r":\d+", "", tname) - if abs: - tname = "abs_" + tname - tname = REDUCTIONS_PREFIX + tname + # if abs: + # tname = "abs_" + tname + # tname = REDUCTIONS_PREFIX + tname + tname = collection_name + "/reductions/" + tname return tname diff --git a/smdebug/pytorch/utils.py b/smdebug/pytorch/utils.py index 7829dde0f..777a6e7ae 100644 --- a/smdebug/pytorch/utils.py +++ b/smdebug/pytorch/utils.py @@ -19,14 +19,25 @@ def get_reduction_of_data(reduction_name, tensor_data, tensor_name, abs=False): return get_numpy_reduction(reduction_name, tensor_data, abs) if abs: tensor_data = torch.abs(tensor_data) - + if reduction_name.startswith("quantile") and hasattr(torch, "quantile"): + f = getattr(torch, "quantile") + value = float(reduction_name.replace("quantile", "")[1]) / 100 + op = f(tensor_data.float(), value) + return op if reduction_name in ALLOWED_REDUCTIONS: if reduction_name == "variance": reduction_name = "var" - assert hasattr(torch.Tensor, reduction_name) - f = getattr(torch.Tensor, reduction_name) - op = f(tensor_data) - return op + if hasattr(torch.Tensor, reduction_name): + f = getattr(torch.Tensor, reduction_name) + op = f(tensor_data.float()) + if reduction_name == "isnan" or reduction_name == "isinf": + op = torch.sum(op) + return op + if hasattr(torch, reduction_name): + f = getattr(torch, reduction_name) + op = f(tensor_data) + op = torch.sum(op) + return op elif reduction_name in ALLOWED_NORMS: if reduction_name in ["l1", "l2"]: ord = int(reduction_name[1]) @@ -34,7 +45,7 @@ def get_reduction_of_data(reduction_name, tensor_data, tensor_name, abs=False): raise RuntimeError( "Invalid normalization operation {0} for torch.Tensor".format(reduction_name) ) - op = torch.norm(tensor_data, p=ord) + op = torch.norm(tensor_data.float(), p=ord) return op elif hasattr(torch, reduction_name): f = getattr(torch, reduction_name) From 0fcb161cd7424e142627a3fc3a9be1088f19500a Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Sun, 14 Nov 2021 14:42:42 -0800 Subject: [PATCH 2/9] update tensor names --- smdebug/core/reductions.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/smdebug/core/reductions.py b/smdebug/core/reductions.py index 15c04cbc5..40eb472a0 100644 --- a/smdebug/core/reductions.py +++ b/smdebug/core/reductions.py @@ -1,5 +1,6 @@ # Standard Library import re +import socket # Third Party import numpy as np @@ -47,14 +48,10 @@ def get_reduction_tensor_name( ): # for frameworks other than TF, it makes sense to not have trailing :0, :1 # but for TF, it makes sense to keep it consistent with TF traditional naming style - # tname = f"{reduction_name}/{tensorname}" tname = f"{tensorname}" if remove_colon_index: tname = re.sub(r":\d+", "", tname) - # if abs: - # tname = "abs_" + tname - # tname = REDUCTIONS_PREFIX + tname - tname = collection_name + "/reductions/" + tname + tname = collection_name + "/reductions/" + tname + "_" + socket.gethostname() return tname From c24fd2eb84453b9fbc442ab6e34c5b3fa9dc1e18 Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Sun, 14 Nov 2021 14:45:37 -0800 Subject: [PATCH 3/9] update reduction_config --- smdebug/core/reduction_config.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/smdebug/core/reduction_config.py b/smdebug/core/reduction_config.py index 955ed68c0..d107023b0 100644 --- a/smdebug/core/reduction_config.py +++ b/smdebug/core/reduction_config.py @@ -103,10 +103,6 @@ def _check(self): continue raise ValueError("abs_reductions can only be one of " + ",".join(ALLOWED_REDUCTIONS)) - # if any([x not in ALLOWED_REDUCTIONS for x in self.reductions]): - # raise ValueError("reductions can only be one of " + ",".join(ALLOWED_REDUCTIONS)) - # if any([x not in ALLOWED_REDUCTIONS for x in self.abs_reductions]): - # raise ValueError("abs_reductions can only be one of " + ",".join(ALLOWED_REDUCTIONS)) if any([x not in ALLOWED_NORMS for x in self.norms]): raise ValueError("norms can only be one of " + ",".join(ALLOWED_NORMS)) if any([x not in ALLOWED_NORMS for x in self.abs_norms]): From ff9dd6396b42563e65078120ed7410ba2659ae8a Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Sun, 21 Nov 2021 16:15:35 -0800 Subject: [PATCH 4/9] bugfix for tf reductions --- smdebug/tensorflow/base_hook.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 82044b6ea..5255f4fd5 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -464,8 +464,14 @@ def _get_collections_with_tensor(self, tf_tensor_name) -> Set["Collection"]: return super()._get_collections_with_tensor(tf_tensor_name) return self.tensor_to_collections[tf_tensor_name] - def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs): - return get_reduction_tensor_name(tensor_name, reduction_name, abs, remove_colon_index=False) + def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs, collection_name=""): + return get_reduction_tensor_name( + tensor_name, + reduction_name, + abs, + remove_colon_index=False, + collection_name=collection_name, + ) def _write_for_tensor(self, tensor_name, tensor_value, save_collections, tensor_ref=None): # When TF 2.x GradientTape is used, the tensors to be saved are of type From 316087eef63fb93f1e4c22d81aa26dd2ebd29773 Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Mon, 22 Nov 2021 08:49:59 -0800 Subject: [PATCH 5/9] fixed tests --- tests/mxnet/test_hook_reduce_config.py | 2 +- tests/tensorflow2/test_keras.py | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py index 46476414d..332d97b56 100644 --- a/tests/mxnet/test_hook_reduce_config.py +++ b/tests/mxnet/test_hook_reduce_config.py @@ -76,7 +76,7 @@ def test_save_config(hook=None, out_dir=None): assert abs_max_val is not None # Custom reduction with normalization - tname = tr.tensor_names(regex=r"flatten\d+_input_0")[0] + tname = tr.tensor_names(regex=r"flatten.*flatten\d+_input_0")[0] flatten_input = tr.tensor(tname) l1_norm = flatten_input.reduction_value(step_num=4, abs=False, reduction_name="l1") assert l1_norm is not None diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index bf481d447..80628501f 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -27,7 +27,7 @@ from smdebug.core.collection import CollectionKeys from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR from smdebug.core.modes import ModeKeys -from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.core.reduction_config import ALLOWED_NORMS from smdebug.exceptions import TensorUnavailableForStep from smdebug.profiler.profiler_constants import DEFAULT_PREFIX from smdebug.tensorflow import ReductionConfig, SaveConfig @@ -249,10 +249,11 @@ def test_gradtape_base_reductions(out_dir): """ Test reduction config """ + reductions = ["min", "max", "mean", "std", "sum", "prod"] helper_keras_gradtape( trial_dir=out_dir, include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.METRICS, CollectionKeys.LOSSES], - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=reductions), ) tr = create_trial_fast_refresh(out_dir) weight_name = tr.tensor_names(collection=CollectionKeys.WEIGHTS)[0] @@ -261,7 +262,7 @@ def test_gradtape_base_reductions(out_dir): assert False except TensorUnavailableForStep: assert tr.tensor(weight_name).reduction_value(0, "l1") is not None - assert len(tr.tensor(weight_name).reduction_values(0)) == len(ALLOWED_REDUCTIONS) + len( + assert len(tr.tensor(weight_name).reduction_values(0)) == len(reductions) + len( ALLOWED_NORMS ) @@ -380,7 +381,9 @@ def test_gradtape_include_collections(out_dir): out_dir, save_config=save_config, include_collections=include_collections, - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig( + norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "sum", "prod"] + ), ) helper_keras_gradtape(out_dir, hook=hook) @@ -527,10 +530,11 @@ def test_keras_fit_shapes(out_dir): @pytest.mark.slow def test_base_reductions(out_dir, tf_eager_mode): + reductions = ["min", "max", "mean", "std", "sum", "prod"] helper_keras_fit( trial_dir=out_dir, include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.METRICS, CollectionKeys.LOSSES], - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=reductions), run_eagerly=tf_eager_mode, ) tr = create_trial_fast_refresh(out_dir) @@ -540,7 +544,7 @@ def test_base_reductions(out_dir, tf_eager_mode): assert False except TensorUnavailableForStep: assert tr.tensor(weight_name).reduction_value(0, "l1") is not None - assert len(tr.tensor(weight_name).reduction_values(0)) == len(ALLOWED_REDUCTIONS) + len( + assert len(tr.tensor(weight_name).reduction_values(0)) == len(reductions) + len( ALLOWED_NORMS ) @@ -719,7 +723,9 @@ def test_include_collections(out_dir, tf_eager_mode): out_dir, save_config=save_config, include_collections=include_collections, - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig( + norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "sum", "prod"] + ), ) hook.get_collection("custom_optimizer_variables").include("Adam") helper_keras_fit( @@ -756,7 +762,9 @@ def test_include_only_custom_collection(out_dir, tf_eager_mode): out_dir, save_config=save_config, include_collections=include_collections, - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig( + norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "sum", "prod"] + ), ) hook.get_collection("custom_optimizer_variables").include("Adam") helper_keras_fit( From ce6e8809a9f98d2067f745ab627d45c49339d87c Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Mon, 22 Nov 2021 18:38:41 -0800 Subject: [PATCH 6/9] changed tensornames --- smdebug/core/hook.py | 6 +++--- smdebug/core/reductions.py | 9 +++++---- smdebug/tensorflow/base_hook.py | 3 +-- tests/mxnet/test_hook_reduce_config.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 92146fc42..c68bf15e3 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -663,20 +663,19 @@ def export_collections(self): collection_file_name = f"{self.worker}_collections.json" self.collection_manager.export(self.out_dir, collection_file_name) - def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs, collection_name=""): + def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs): return get_reduction_tensor_name( tensor_name, reduction_name, abs, remove_colon_index=True, - collection_name=collection_name, ) def _write_reduction( self, tensor_name, tensor_value, reduction_name, abs, tensor_ref=None, collection_name="" ): reduction_tensor_name = self._get_reduction_tensor_name( - tensor_name, reduction_name, abs, collection_name=collection_name + tensor_name, reduction_name, abs ) try: tensor_data = self._get_reduction_of_data( @@ -687,6 +686,7 @@ def _write_reduction( reduction_name = "abs_" + reduction_name tb_writer = self._maybe_get_tb_writer(subfolder=reduction_name) if tb_writer: + reduction_tensor_name = collection_name + "/reductions/" + tensor_name + "/" + self.worker scalar = self._make_numpy_array(tensor_data) tb_writer.write_scalar_summary(reduction_tensor_name, scalar, self.step) except ValueError as e: diff --git a/smdebug/core/reductions.py b/smdebug/core/reductions.py index 40eb472a0..7228c770a 100644 --- a/smdebug/core/reductions.py +++ b/smdebug/core/reductions.py @@ -1,6 +1,5 @@ # Standard Library import re -import socket # Third Party import numpy as np @@ -44,14 +43,16 @@ def get_basic_numpy_reduction(reduction_name, numpy_data): def get_reduction_tensor_name( - tensorname, reduction_name, abs, remove_colon_index=True, collection_name="smdebug" + tensorname, reduction_name, abs, remove_colon_index=True ): # for frameworks other than TF, it makes sense to not have trailing :0, :1 # but for TF, it makes sense to keep it consistent with TF traditional naming style - tname = f"{tensorname}" + tname = f"{reduction_name}/{tensorname}" if remove_colon_index: tname = re.sub(r":\d+", "", tname) - tname = collection_name + "/reductions/" + tname + "_" + socket.gethostname() + if abs: + tname = "abs_" + tname + tname = REDUCTIONS_PREFIX + tname return tname diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 5255f4fd5..48756a9a8 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -464,13 +464,12 @@ def _get_collections_with_tensor(self, tf_tensor_name) -> Set["Collection"]: return super()._get_collections_with_tensor(tf_tensor_name) return self.tensor_to_collections[tf_tensor_name] - def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs, collection_name=""): + def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs): return get_reduction_tensor_name( tensor_name, reduction_name, abs, remove_colon_index=False, - collection_name=collection_name, ) def _write_for_tensor(self, tensor_name, tensor_value, save_collections, tensor_ref=None): diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py index 332d97b56..37af66a3b 100644 --- a/tests/mxnet/test_hook_reduce_config.py +++ b/tests/mxnet/test_hook_reduce_config.py @@ -76,7 +76,7 @@ def test_save_config(hook=None, out_dir=None): assert abs_max_val is not None # Custom reduction with normalization - tname = tr.tensor_names(regex=r"flatten.*flatten\d+_input_0")[0] + tname = tr.tensor_names(regex=r"*flatten\d+_input_0")[0] flatten_input = tr.tensor(tname) l1_norm = flatten_input.reduction_value(step_num=4, abs=False, reduction_name="l1") assert l1_norm is not None From 3d52a9acc224817afd500c3b484a378124605375 Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Mon, 22 Nov 2021 18:43:38 -0800 Subject: [PATCH 7/9] minor bugfixes --- smdebug/core/hook.py | 4 ++-- smdebug/tensorflow/base_hook.py | 2 +- tests/mxnet/test_hook_reduce_config.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index c68bf15e3..36e66c629 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -709,7 +709,7 @@ def _write_reductions(self, tensor_name, tensor_value, save_collections, tensor_ reduction, abs=False, tensor_ref=tensor_ref, - collection_name=s_col.name, + collection_name=s_col.name ) reductions_saved.add((reduction, False)) for reduction_list in (reduction_config.abs_reductions, reduction_config.abs_norms): @@ -721,7 +721,7 @@ def _write_reductions(self, tensor_name, tensor_value, save_collections, tensor_ reduction, abs=True, tensor_ref=tensor_ref, - collection_name=s_col.name, + collection_name=s_col.name ) reductions_saved.add((reduction, True)) diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 48756a9a8..06d7b05b5 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -469,7 +469,7 @@ def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs): tensor_name, reduction_name, abs, - remove_colon_index=False, + remove_colon_index=False ) def _write_for_tensor(self, tensor_name, tensor_value, save_collections, tensor_ref=None): diff --git a/tests/mxnet/test_hook_reduce_config.py b/tests/mxnet/test_hook_reduce_config.py index 37af66a3b..46476414d 100644 --- a/tests/mxnet/test_hook_reduce_config.py +++ b/tests/mxnet/test_hook_reduce_config.py @@ -76,7 +76,7 @@ def test_save_config(hook=None, out_dir=None): assert abs_max_val is not None # Custom reduction with normalization - tname = tr.tensor_names(regex=r"*flatten\d+_input_0")[0] + tname = tr.tensor_names(regex=r"flatten\d+_input_0")[0] flatten_input = tr.tensor(tname) l1_norm = flatten_input.reduction_value(step_num=4, abs=False, reduction_name="l1") assert l1_norm is not None From 61e7537892ebedbe46c8d879079643a7c7bbb6a5 Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Mon, 22 Nov 2021 18:46:04 -0800 Subject: [PATCH 8/9] pre-commit --- smdebug/core/hook.py | 19 +++++++------------ smdebug/core/reductions.py | 4 +--- smdebug/tensorflow/base_hook.py | 7 +------ 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 36e66c629..733c80ca7 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -664,19 +664,12 @@ def export_collections(self): self.collection_manager.export(self.out_dir, collection_file_name) def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs): - return get_reduction_tensor_name( - tensor_name, - reduction_name, - abs, - remove_colon_index=True, - ) + return get_reduction_tensor_name(tensor_name, reduction_name, abs, remove_colon_index=True) def _write_reduction( self, tensor_name, tensor_value, reduction_name, abs, tensor_ref=None, collection_name="" ): - reduction_tensor_name = self._get_reduction_tensor_name( - tensor_name, reduction_name, abs - ) + reduction_tensor_name = self._get_reduction_tensor_name(tensor_name, reduction_name, abs) try: tensor_data = self._get_reduction_of_data( reduction_name, tensor_value, tensor_name, abs @@ -686,7 +679,9 @@ def _write_reduction( reduction_name = "abs_" + reduction_name tb_writer = self._maybe_get_tb_writer(subfolder=reduction_name) if tb_writer: - reduction_tensor_name = collection_name + "/reductions/" + tensor_name + "/" + self.worker + reduction_tensor_name = ( + collection_name + "/reductions/" + tensor_name + "/" + self.worker + ) scalar = self._make_numpy_array(tensor_data) tb_writer.write_scalar_summary(reduction_tensor_name, scalar, self.step) except ValueError as e: @@ -709,7 +704,7 @@ def _write_reductions(self, tensor_name, tensor_value, save_collections, tensor_ reduction, abs=False, tensor_ref=tensor_ref, - collection_name=s_col.name + collection_name=s_col.name, ) reductions_saved.add((reduction, False)) for reduction_list in (reduction_config.abs_reductions, reduction_config.abs_norms): @@ -721,7 +716,7 @@ def _write_reductions(self, tensor_name, tensor_value, save_collections, tensor_ reduction, abs=True, tensor_ref=tensor_ref, - collection_name=s_col.name + collection_name=s_col.name, ) reductions_saved.add((reduction, True)) diff --git a/smdebug/core/reductions.py b/smdebug/core/reductions.py index 7228c770a..3be6c98b5 100644 --- a/smdebug/core/reductions.py +++ b/smdebug/core/reductions.py @@ -42,9 +42,7 @@ def get_basic_numpy_reduction(reduction_name, numpy_data): return None -def get_reduction_tensor_name( - tensorname, reduction_name, abs, remove_colon_index=True -): +def get_reduction_tensor_name(tensorname, reduction_name, abs, remove_colon_index=True): # for frameworks other than TF, it makes sense to not have trailing :0, :1 # but for TF, it makes sense to keep it consistent with TF traditional naming style tname = f"{reduction_name}/{tensorname}" diff --git a/smdebug/tensorflow/base_hook.py b/smdebug/tensorflow/base_hook.py index 06d7b05b5..82044b6ea 100644 --- a/smdebug/tensorflow/base_hook.py +++ b/smdebug/tensorflow/base_hook.py @@ -465,12 +465,7 @@ def _get_collections_with_tensor(self, tf_tensor_name) -> Set["Collection"]: return self.tensor_to_collections[tf_tensor_name] def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs): - return get_reduction_tensor_name( - tensor_name, - reduction_name, - abs, - remove_colon_index=False - ) + return get_reduction_tensor_name(tensor_name, reduction_name, abs, remove_colon_index=False) def _write_for_tensor(self, tensor_name, tensor_value, save_collections, tensor_ref=None): # When TF 2.x GradientTape is used, the tensors to be saved are of type From 66999aaaaccd83f237cd299be5624303cd8d028c Mon Sep 17 00:00:00 2001 From: NRauschmayr Date: Mon, 22 Nov 2021 19:32:34 -0800 Subject: [PATCH 9/9] updated tf testcases --- tests/tensorflow/hooks/test_reductions.py | 8 ++++---- tests/tensorflow/keras/test_keras.py | 7 ++++--- tests/tensorflow/keras/test_keras_mirrored.py | 6 ++++-- tests/tensorflow2/test_keras_mirrored.py | 6 ++++-- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index e009f4565..aaa957b9c 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -6,7 +6,7 @@ # First Party import smdebug.tensorflow as smd from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR -from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.core.reduction_config import ALLOWED_NORMS from smdebug.exceptions import TensorUnavailableForStep # Local @@ -37,7 +37,7 @@ def helper_test_reductions(trial_dir, hook, save_raw_tensor): except TensorUnavailableForStep as e: pass assert len(t.reduction_values(0)) == 18 - for r in ALLOWED_REDUCTIONS + ALLOWED_NORMS: + for r in ["min", "max", "mean", "std", "variance", "sum", "prod"] + ALLOWED_NORMS: for b in [False, True]: assert t.reduction_value(0, reduction_name=r, abs=b, worker=None) is not None @@ -45,8 +45,8 @@ def helper_test_reductions(trial_dir, hook, save_raw_tensor): def test_reductions(out_dir, save_raw_tensor=False): pre_test_clean_up() rdnc = smd.ReductionConfig( - reductions=ALLOWED_REDUCTIONS, - abs_reductions=ALLOWED_REDUCTIONS, + reductions=["min", "max", "mean", "std", "variance", "sum", "prod"], + abs_reductions=["min", "max", "mean", "std", "variance", "sum", "prod"], norms=ALLOWED_NORMS, abs_norms=ALLOWED_NORMS, save_raw_tensor=save_raw_tensor, diff --git a/tests/tensorflow/keras/test_keras.py b/tests/tensorflow/keras/test_keras.py index bfd5e7cc7..836472ec7 100644 --- a/tests/tensorflow/keras/test_keras.py +++ b/tests/tensorflow/keras/test_keras.py @@ -11,7 +11,7 @@ from smdebug.core.access_layer import has_training_ended from smdebug.core.collection import CollectionKeys from smdebug.core.modes import ModeKeys -from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.core.reduction_config import ALLOWED_NORMS from smdebug.exceptions import TensorUnavailableForStep from smdebug.tensorflow import ReductionConfig, SaveConfig from smdebug.tensorflow.keras import KerasHook @@ -275,10 +275,11 @@ def test_save_all(out_dir): @pytest.mark.slow # 0:03 to run def test_base_reductions(out_dir): + reductions = ["min", "max", "mean", "std", "variance", "sum", "prod"] train_model( out_dir, include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.METRICS, CollectionKeys.LOSSES], - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=reductions), steps=["train"], ) tr = create_trial_fast_refresh(out_dir) @@ -288,7 +289,7 @@ def test_base_reductions(out_dir): assert False except TensorUnavailableForStep: assert tr.tensor(weight_name).reduction_value(0, "l1") is not None - assert len(tr.tensor(weight_name).reduction_values(0)) == len(ALLOWED_REDUCTIONS) + len( + assert len(tr.tensor(weight_name).reduction_values(0)) == len(reductions) + len( ALLOWED_NORMS ) diff --git a/tests/tensorflow/keras/test_keras_mirrored.py b/tests/tensorflow/keras/test_keras_mirrored.py index 469f19a6c..8405e7165 100644 --- a/tests/tensorflow/keras/test_keras_mirrored.py +++ b/tests/tensorflow/keras/test_keras_mirrored.py @@ -16,7 +16,7 @@ from smdebug.core.access_layer import has_training_ended from smdebug.core.collection import CollectionKeys from smdebug.core.modes import ModeKeys -from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.core.reduction_config import ALLOWED_NORMS from smdebug.exceptions import TensorUnavailable, TensorUnavailableForStep from smdebug.tensorflow import ReductionConfig, SaveConfig from smdebug.tensorflow.keras import KerasHook @@ -408,7 +408,9 @@ def test_base_reductions(out_dir): CollectionKeys.METRICS, CollectionKeys.LOSSES, ], - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig( + norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "variance", "sum", "prod"] + ), steps=["train"], ) diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index 7c01c18bc..df48afaf8 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -20,7 +20,7 @@ from smdebug.core.access_layer import has_training_ended from smdebug.core.collection import CollectionKeys from smdebug.core.modes import ModeKeys -from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.core.reduction_config import ALLOWED_NORMS from smdebug.exceptions import TensorUnavailable, TensorUnavailableForStep from smdebug.tensorflow import ReductionConfig, SaveConfig from smdebug.tensorflow.keras import KerasHook @@ -321,7 +321,9 @@ def test_base_reductions(out_dir, tf_eager_mode): CollectionKeys.METRICS, CollectionKeys.LOSSES, ], - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig( + norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "variance", "sum", "prod"] + ), steps=["train"], eager=tf_eager_mode, )