diff --git a/smdebug/_version.py b/smdebug/_version.py
index e13bd590c..ad53abcd6 100644
--- a/smdebug/_version.py
+++ b/smdebug/_version.py
@@ -1 +1 @@
-__version__ = "1.0.8"
+__version__ = "1.0.8c0"
diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py
index 4064b3d28..de0ea3ab3 100644
--- a/smdebug/core/hook.py
+++ b/smdebug/core/hook.py
@@ -563,6 +563,7 @@ def _increment_step(self):
         self._write_state()
         self.step += 1
+        self.logger.debug(f"Smdebug: Increment Step: {self.step}")
         self.mode_steps[self.mode] += 1
         self.written_tensor_name_for_step.clear()
@@ -592,6 +593,7 @@ def should_save_tensor_or_collection(self, tensor_name: str, collection_name: st
     def _write_state(self):
         if self.state_store.is_checkpoint_updated():
+            self.logger.debug("smdebug: writing checkpoint")
             current_state = dict()
             current_state[TRAINING_RUN] = self.training_run
             current_state[LATEST_GLOBAL_STEP_SAVED] = self.last_saved_step
diff --git a/smdebug/core/writer.py b/smdebug/core/writer.py
index 29ed7daaa..b55593618 100644
--- a/smdebug/core/writer.py
+++ b/smdebug/core/writer.py
@@ -88,6 +88,7 @@ def __init__(
         self.flush_secs = flush_secs
         self.verbose = verbose
         self.write_checksum = write_checksum
+        self.logger = get_logger()

         self._proto_writer = None
@@ -155,6 +156,7 @@ def write_tensor(
         tag = tname
         tensor_proto = make_tensor_proto(nparray_data=value, tag=tag)
         s = Summary(value=[Summary.Value(tag=tag, metadata=smd, tensor=tensor_proto)])
+        self.logger.debug(f"smdebug: writing tensor: {tname}")
         if write_index:
             self.proto_writer.write_summary_with_index(
                 s, self.step, tname, mode, mode_step, timestamp=timestamp
diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py
index 614669616..133bded1d 100644
--- a/smdebug/tensorflow/keras.py
+++ b/smdebug/tensorflow/keras.py
@@ -166,6 +166,7 @@ def register_model(self, model):
         # This function is called by the hook in the AWS TF codebase
         # It attaches a hook to every layer of the model to capture
         # layer values
+        self.logger.debug("Smdebug Register Model")
         self.model = model
         self._wrap_model_with_input_output_saver()
         self.has_registered_model = True
@@ -553,6 +554,7 @@ def _save_model_inputs_and_outputs_helper(self, collection_key, tensors_to_save,
     def save_smdebug_logs(self, logs):
         if logs is None:
             return
+        self.logger.debug(f"Length of logs shared to debugger: {len(logs)}")

         for key in logs:
             if SMDEBUG_PREFIX in key:
@@ -725,6 +727,7 @@ def _remove_fetches_and_callbacks(self, mode):
     def _prepare_collections_for_tf2(self):
         self._prepare_collections()
+        self.logger.debug("SMDEBUG: Preparing Collections")
         if self.has_default_hook_configuration():
             # wrapping the model is only supported if the hook does not have the default hook configuration
             self._unwrap_model_with_input_output_saver()
@@ -750,6 +753,7 @@ def _on_any_mode_begin(self, mode):
         self.graph = tf.get_default_graph()
         self.set_mode(mode)
+        self.logger.debug(f"SMDEBUG: Set mode: {mode}")

         if self.prepared_collections is False and is_tf_version_2_3_x():
             # Addresses ordering issues in TF 2.3.0
@@ -762,9 +766,11 @@ def _on_any_mode_begin(self, mode):
             self.callable_cache.change_mode()

     def on_train_begin(self, logs=None):
+        self.logger.debug("SMDEBUG: Entering On Train Begin")
         self._on_any_mode_begin(ModeKeys.TRAIN)

     def on_test_begin(self, logs=None):
+        self.logger.debug("SMDEBUG: Entering On Test Begin")
         self._on_any_mode_begin(ModeKeys.EVAL)

     def _on_any_mode_end(self, mode):
@@ -811,6 +817,7 @@ def on_predict_begin(self, logs=None):
     def _wrap_model_with_input_output_saver(self):
         if self.has_registered_model:
             return
+        self.logger.debug("SMDEBUG: Attach Model Layers With Hooks")
         for layer in self.model.layers:
             layer._hooks = []
             layer.call = get_layer_call_fn(layer)
@@ -822,10 +829,12 @@ def _wrap_model_with_input_output_saver(self):
     def _unwrap_model_with_input_output_saver(self):
         if self.has_registered_model is False:
             return
+        self.logger.debug("SMDEBUG: Remove Hooks From Model Layers")
         for layer in self.model.layers:
             layer.call = layer.old_call

     def _on_any_batch_begin(self, batch, mode, logs=None):
+        self.logger.debug(f"SMDEBUG: On Any Batch Begin: {mode}")
         self.start = time.time()
         if self._is_not_supported():
             return
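Note: every message added here is emitted at DEBUG level, so it stays silent under smdebug's default log level. The sketch below shows one way a user could surface the new output while exercising this change; it assumes the SMDEBUG_LOG_LEVEL environment variable documented for sagemaker-debugger is honored, and the output directory and the commented-out fit() call are illustrative, not part of this patch.

    import os

    # Assumption: smdebug's logger picks up SMDEBUG_LOG_LEVEL when the package
    # is first imported, so set it before importing smdebug.
    os.environ["SMDEBUG_LOG_LEVEL"] = "debug"

    import smdebug.tensorflow as smd

    # Hypothetical output directory; any writable path works.
    hook = smd.KerasHook(out_dir="/tmp/smdebug_run")

    # Passing the hook as a Keras callback drives on_train_begin /
    # _on_any_batch_begin above, so messages such as
    # "SMDEBUG: Entering On Train Begin" and "smdebug: writing tensor: ..."
    # appear in the training log.
    # model.fit(x, y, epochs=1, callbacks=[hook])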