diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 38ff7c135..733c80ca7 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -512,7 +512,7 @@ def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]: return [] return self._get_main_writer() - def _maybe_get_tb_writer(self) -> Optional[FileWriter]: + def _maybe_get_tb_writer(self, subfolder=None) -> Optional[FileWriter]: """ Returns a FileWriter object if `hook.tensorboard_dir` has been specified, else None. Creates a writer if does not exist. @@ -520,22 +520,25 @@ def _maybe_get_tb_writer(self) -> Optional[FileWriter]: if not self.tensorboard_dir: return None - if self.mode in self.tb_writers: - assert self.tb_writers[self.mode] is not None + if subfolder == None: + subfolder = self.mode + + if subfolder in self.tb_writers: + assert self.tb_writers[subfolder] is not None # would be there if set_mode was called - return self.tb_writers[self.mode] + return self.tb_writers[subfolder] else: # s = self.step # if s < 0: s = 0 - self.tb_writers[self.mode] = FileWriter( + self.tb_writers[subfolder] = FileWriter( trial_dir=self.tensorboard_dir, step=self.step, worker=get_tb_worker(), write_checksum=True, wtype="tensorboard", - mode=self.mode, + mode=subfolder, ) - return self.tb_writers[self.mode] + return self.tb_writers[subfolder] def _close_tb_writer(self): if self.dry_run: @@ -663,13 +666,24 @@ def export_collections(self): def _get_reduction_tensor_name(self, tensor_name, reduction_name, abs): return get_reduction_tensor_name(tensor_name, reduction_name, abs, remove_colon_index=True) - def _write_reduction(self, tensor_name, tensor_value, reduction_name, abs, tensor_ref=None): + def _write_reduction( + self, tensor_name, tensor_value, reduction_name, abs, tensor_ref=None, collection_name="" + ): reduction_tensor_name = self._get_reduction_tensor_name(tensor_name, reduction_name, abs) try: tensor_data = self._get_reduction_of_data( reduction_name, tensor_value, tensor_name, abs ) 
self._write_raw_tensor_simple(reduction_tensor_name, tensor_data, tensor_ref=tensor_ref) + if abs: + reduction_name = "abs_" + reduction_name + tb_writer = self._maybe_get_tb_writer(subfolder=reduction_name) + if tb_writer: + reduction_tensor_name = ( + collection_name + "/reductions/" + tensor_name + "/" + self.worker + ) + scalar = self._make_numpy_array(tensor_data) + tb_writer.write_scalar_summary(reduction_tensor_name, scalar, self.step) except ValueError as e: self.logger.warning( f"Could not compute reduction {reduction_name} of {tensor_name} due to {e}" @@ -685,14 +699,24 @@ def _write_reductions(self, tensor_name, tensor_value, save_collections, tensor_ for reduction in reduction_list: if (reduction, False) not in reductions_saved: self._write_reduction( - tensor_name, tensor_value, reduction, abs=False, tensor_ref=tensor_ref + tensor_name, + tensor_value, + reduction, + abs=False, + tensor_ref=tensor_ref, + collection_name=s_col.name, ) reductions_saved.add((reduction, False)) for reduction_list in (reduction_config.abs_reductions, reduction_config.abs_norms): for reduction in reduction_list: if (reduction, True) not in reductions_saved: self._write_reduction( - tensor_name, tensor_value, reduction, abs=True, tensor_ref=tensor_ref + tensor_name, + tensor_value, + reduction, + abs=True, + tensor_ref=tensor_ref, + collection_name=s_col.name, ) reductions_saved.add((reduction, True)) diff --git a/smdebug/core/locations.py b/smdebug/core/locations.py index 0ee17b7cf..6146eb80c 100644 --- a/smdebug/core/locations.py +++ b/smdebug/core/locations.py @@ -120,11 +120,14 @@ def __init__(self, step_num, worker_name, mode=None): def get_file_location(self, base_dir=""): # when base_dir is empty it just returns the relative file path + if hasattr(self.mode, "name"): + subfolder = self.mode.name + else: + subfolder = self.mode if base_dir: - event_key_prefix = os.path.join(base_dir, self.mode.name) + event_key_prefix = os.path.join(base_dir, subfolder) else: - 
event_key_prefix = os.path.join(self.type, self.mode.name) - + event_key_prefix = os.path.join(self.type, subfolder) return os.path.join(event_key_prefix, self.get_filename()) diff --git a/smdebug/core/reduction_config.py b/smdebug/core/reduction_config.py index 1fa6121b6..d107023b0 100644 --- a/smdebug/core/reduction_config.py +++ b/smdebug/core/reduction_config.py @@ -3,13 +3,25 @@ from typing import Any, Dict # First Party +from smdebug.analysis.utils import parse_bool from smdebug.core.logger import get_logger from smdebug.core.utils import split logger = get_logger() -ALLOWED_REDUCTIONS = ["min", "max", "mean", "std", "variance", "sum", "prod"] +ALLOWED_REDUCTIONS = [ + "min", + "max", + "mean", + "std", + "variance", + "sum", + "prod", + "isnan", + "isinf", + "quantile", +] ALLOWED_NORMS = ["l1", "l2"] REDUCTION_CONFIG_VERSION_NUM = "v0" ALLOWED_PARAMS = [ @@ -66,7 +78,7 @@ def __init__( self.abs_reductions = abs_reductions if abs_reductions is not None else [] self.norms = norms if norms is not None else [] self.abs_norms = abs_norms if abs_norms is not None else [] - self.save_raw_tensor = save_raw_tensor + self.save_raw_tensor = parse_bool(save_raw_tensor, True) self.save_shape = save_shape ## DO NOT REMOVE, if you add anything here, please make sure that _check & from_json is updated accordingly self._check() @@ -77,11 +89,20 @@ def _check(self): raise ValueError( "allowed params for reduction config can only be one of " + ",".join(ALLOWED_PARAMS) ) - - if any([x not in ALLOWED_REDUCTIONS for x in self.reductions]): + for index, reduction_allowed in enumerate( + [x in ALLOWED_REDUCTIONS for x in self.reductions] + ): + if reduction_allowed or self.reductions[index].startswith("quantile"): + continue raise ValueError("reductions can only be one of " + ",".join(ALLOWED_REDUCTIONS)) - if any([x not in ALLOWED_REDUCTIONS for x in self.abs_reductions]): + + for index, reduction_allowed in enumerate( + [x in ALLOWED_REDUCTIONS for x in self.abs_reductions] + ): 
+ if reduction_allowed or self.abs_reductions[index].startswith("quantile"): + continue raise ValueError("abs_reductions can only be one of " + ",".join(ALLOWED_REDUCTIONS)) + if any([x not in ALLOWED_NORMS for x in self.norms]): raise ValueError("norms can only be one of " + ",".join(ALLOWED_NORMS)) if any([x not in ALLOWED_NORMS for x in self.abs_norms]): diff --git a/smdebug/pytorch/utils.py b/smdebug/pytorch/utils.py index 24a58d678..09d654e8c 100644 --- a/smdebug/pytorch/utils.py +++ b/smdebug/pytorch/utils.py @@ -21,14 +21,25 @@ def get_reduction_of_data(reduction_name, tensor_data, tensor_name, abs=False): return get_numpy_reduction(reduction_name, tensor_data, abs) if abs: tensor_data = torch.abs(tensor_data) - + if reduction_name.startswith("quantile") and hasattr(torch, "quantile"): + f = getattr(torch, "quantile") + value = float(reduction_name.replace("quantile", "")[1:]) / 100 + op = f(tensor_data.float(), value) + return op if reduction_name in ALLOWED_REDUCTIONS: if reduction_name == "variance": reduction_name = "var" - assert hasattr(torch.Tensor, reduction_name) - f = getattr(torch.Tensor, reduction_name) - op = f(tensor_data) - return op + if hasattr(torch.Tensor, reduction_name): + f = getattr(torch.Tensor, reduction_name) + op = f(tensor_data.float()) + if reduction_name == "isnan" or reduction_name == "isinf": + op = torch.sum(op) + return op + if hasattr(torch, reduction_name): + f = getattr(torch, reduction_name) + op = f(tensor_data) + op = torch.sum(op) + return op elif reduction_name in ALLOWED_NORMS: if reduction_name in ["l1", "l2"]: ord = int(reduction_name[1]) @@ -36,7 +47,7 @@ def get_reduction_of_data(reduction_name, tensor_data, tensor_name, abs=False): raise RuntimeError( "Invalid normalization operation {0} for torch.Tensor".format(reduction_name) ) - op = torch.norm(tensor_data, p=ord) + op = torch.norm(tensor_data.float(), p=ord) return op elif hasattr(torch, reduction_name): f = getattr(torch, reduction_name) diff --git
a/tests/tensorflow/hooks/test_reductions.py b/tests/tensorflow/hooks/test_reductions.py index e009f4565..aaa957b9c 100644 --- a/tests/tensorflow/hooks/test_reductions.py +++ b/tests/tensorflow/hooks/test_reductions.py @@ -6,7 +6,7 @@ # First Party import smdebug.tensorflow as smd from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR -from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.core.reduction_config import ALLOWED_NORMS from smdebug.exceptions import TensorUnavailableForStep # Local @@ -37,7 +37,7 @@ def helper_test_reductions(trial_dir, hook, save_raw_tensor): except TensorUnavailableForStep as e: pass assert len(t.reduction_values(0)) == 18 - for r in ALLOWED_REDUCTIONS + ALLOWED_NORMS: + for r in ["min", "max", "mean", "std", "variance", "sum", "prod"] + ALLOWED_NORMS: for b in [False, True]: assert t.reduction_value(0, reduction_name=r, abs=b, worker=None) is not None @@ -45,8 +45,8 @@ def helper_test_reductions(trial_dir, hook, save_raw_tensor): def test_reductions(out_dir, save_raw_tensor=False): pre_test_clean_up() rdnc = smd.ReductionConfig( - reductions=ALLOWED_REDUCTIONS, - abs_reductions=ALLOWED_REDUCTIONS, + reductions=["min", "max", "mean", "std", "variance", "sum", "prod"], + abs_reductions=["min", "max", "mean", "std", "variance", "sum", "prod"], norms=ALLOWED_NORMS, abs_norms=ALLOWED_NORMS, save_raw_tensor=save_raw_tensor, diff --git a/tests/tensorflow/keras/test_keras.py b/tests/tensorflow/keras/test_keras.py index 949f7539c..37b99092b 100644 --- a/tests/tensorflow/keras/test_keras.py +++ b/tests/tensorflow/keras/test_keras.py @@ -295,10 +295,11 @@ def test_save_all(out_dir): @pytest.mark.slow # 0:03 to run def test_base_reductions(out_dir): + reductions = ["min", "max", "mean", "std", "variance", "sum", "prod"] train_model( out_dir, include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.METRICS, CollectionKeys.LOSSES], - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, 
reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=reductions), steps=["train"], ) tr = create_trial_fast_refresh(out_dir) @@ -308,7 +309,7 @@ def test_base_reductions(out_dir): assert False except TensorUnavailableForStep: assert tr.tensor(weight_name).reduction_value(0, "l1") is not None - assert len(tr.tensor(weight_name).reduction_values(0)) == len(ALLOWED_REDUCTIONS) + len( + assert len(tr.tensor(weight_name).reduction_values(0)) == len(reductions) + len( ALLOWED_NORMS ) diff --git a/tests/tensorflow/keras/test_keras_mirrored.py b/tests/tensorflow/keras/test_keras_mirrored.py index 661d7f101..8f7d3f63a 100644 --- a/tests/tensorflow/keras/test_keras_mirrored.py +++ b/tests/tensorflow/keras/test_keras_mirrored.py @@ -16,7 +16,7 @@ from smdebug.core.access_layer import has_training_ended from smdebug.core.collection import CollectionKeys from smdebug.core.modes import ModeKeys -from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.core.reduction_config import ALLOWED_NORMS from smdebug.exceptions import TensorUnavailable, TensorUnavailableForStep from smdebug.tensorflow import ReductionConfig, SaveConfig from smdebug.tensorflow.keras import KerasHook @@ -411,7 +411,9 @@ def test_base_reductions(out_dir): CollectionKeys.METRICS, CollectionKeys.LOSSES, ], - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig( + norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "variance", "sum", "prod"] + ), steps=["train"], ) diff --git a/tests/tensorflow2/test_keras.py b/tests/tensorflow2/test_keras.py index f379d07d3..bd8af08eb 100644 --- a/tests/tensorflow2/test_keras.py +++ b/tests/tensorflow2/test_keras.py @@ -26,7 +26,7 @@ from smdebug.core.collection import CollectionKeys from smdebug.core.json_config import CONFIG_FILE_PATH_ENV_STR from smdebug.core.modes import ModeKeys -from smdebug.core.reduction_config 
import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.core.reduction_config import ALLOWED_NORMS from smdebug.exceptions import TensorUnavailableForStep from smdebug.profiler.profiler_constants import DEFAULT_PREFIX from smdebug.tensorflow import ReductionConfig, SaveConfig @@ -248,10 +248,11 @@ def test_gradtape_base_reductions(out_dir): """ Test reduction config """ + reductions = ["min", "max", "mean", "std", "sum", "prod"] helper_keras_gradtape( trial_dir=out_dir, include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.METRICS, CollectionKeys.LOSSES], - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=reductions), ) tr = create_trial_fast_refresh(out_dir) weight_name = tr.tensor_names(collection=CollectionKeys.WEIGHTS)[0] @@ -260,7 +261,7 @@ def test_gradtape_base_reductions(out_dir): assert False except TensorUnavailableForStep: assert tr.tensor(weight_name).reduction_value(0, "l1") is not None - assert len(tr.tensor(weight_name).reduction_values(0)) == len(ALLOWED_REDUCTIONS) + len( + assert len(tr.tensor(weight_name).reduction_values(0)) == len(reductions) + len( ALLOWED_NORMS ) @@ -379,7 +380,9 @@ def test_gradtape_include_collections(out_dir): out_dir, save_config=save_config, include_collections=include_collections, - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig( + norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "sum", "prod"] + ), ) helper_keras_gradtape(out_dir, hook=hook) @@ -526,10 +529,11 @@ def test_keras_fit_shapes(out_dir): @pytest.mark.slow def test_base_reductions(out_dir, tf_eager_mode): + reductions = ["min", "max", "mean", "std", "sum", "prod"] helper_keras_fit( trial_dir=out_dir, include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.METRICS, CollectionKeys.LOSSES], - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, 
reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=reductions), run_eagerly=tf_eager_mode, ) tr = create_trial_fast_refresh(out_dir) @@ -539,7 +543,7 @@ def test_base_reductions(out_dir, tf_eager_mode): assert False except TensorUnavailableForStep: assert tr.tensor(weight_name).reduction_value(0, "l1") is not None - assert len(tr.tensor(weight_name).reduction_values(0)) == len(ALLOWED_REDUCTIONS) + len( + assert len(tr.tensor(weight_name).reduction_values(0)) == len(reductions) + len( ALLOWED_NORMS ) @@ -718,7 +722,9 @@ def test_include_collections(out_dir, tf_eager_mode): out_dir, save_config=save_config, include_collections=include_collections, - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig( + norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "sum", "prod"] + ), ) hook.get_collection("custom_optimizer_variables").include("Adam") helper_keras_fit( @@ -755,7 +761,9 @@ def test_include_only_custom_collection(out_dir, tf_eager_mode): out_dir, save_config=save_config, include_collections=include_collections, - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig( + norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "sum", "prod"] + ), ) hook.get_collection("custom_optimizer_variables").include("Adam") helper_keras_fit( diff --git a/tests/tensorflow2/test_keras_mirrored.py b/tests/tensorflow2/test_keras_mirrored.py index d7dd2cb1d..8cadadf42 100644 --- a/tests/tensorflow2/test_keras_mirrored.py +++ b/tests/tensorflow2/test_keras_mirrored.py @@ -20,7 +20,7 @@ from smdebug.core.access_layer import has_training_ended from smdebug.core.collection import CollectionKeys from smdebug.core.modes import ModeKeys -from smdebug.core.reduction_config import ALLOWED_NORMS, ALLOWED_REDUCTIONS +from smdebug.core.reduction_config import ALLOWED_NORMS from smdebug.exceptions 
import TensorUnavailable, TensorUnavailableForStep from smdebug.tensorflow import ReductionConfig, SaveConfig from smdebug.tensorflow.keras import KerasHook @@ -325,7 +325,9 @@ def test_base_reductions(out_dir, tf_eager_mode): CollectionKeys.METRICS, CollectionKeys.LOSSES, ], - reduction_config=ReductionConfig(norms=ALLOWED_NORMS, reductions=ALLOWED_REDUCTIONS), + reduction_config=ReductionConfig( + norms=ALLOWED_NORMS, reductions=["min", "max", "mean", "std", "variance", "sum", "prod"] + ), steps=["train"], eager=tf_eager_mode, )