From 37b70f027a0503cbc91f23f03fc8f265b3b937f7 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:32:23 -0800 Subject: [PATCH 01/97] add changes for enabling profiling for native tf2 training --- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py new file mode 100644 index 000000000..e69de29bb From bb67bb0e485c5467f26019bbe71efc68022e9ad4 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:42:55 -0800 Subject: [PATCH 02/97] add changes for enabling profiling in the native tf2 training --- smdebug/tensorflow/keras.py | 169 +++++++++++++++++- .../tensorflow2/test_native_tf2_profiling.py | 144 +++++++++++++++ 2 files changed, 305 insertions(+), 8 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 49a87419e..1c0ee5805 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -121,6 +121,9 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False + # this flag is used to handle step number increment in the tensorflow native training + # it indicated to profiling for tensorflow2 native training + self.profiling_native_training = False if python_profiler: atexit.register(python_profiler.stop_profiling, StepPhase.END) @@ -1099,7 +1102,9 @@ def unwrap(func): def close(self): self._cleanup() + print('\nStep Number in the close function: ', self.step) if python_profiler: + print('python profiling for end of last train step to end of training') python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1143,7 +1148,9 @@ def run(*args, **kwargs): self._prepare_collections() self.prepared_collections = True - self._increment_step() + if not self.profiling_native_training: + self._increment_step() + print('\nStep number in the push tape: ', self.step) if self._get_collections_to_save_for_step(): self._initialize_writers() @@ -1233,6 +1240,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step + print('\nStep number in the pop tape: ', self.step) return run @@ -1259,13 +1267,13 @@ def wrap_tape(self, tape): :return: Wrapped tape of same type as passed. This tape should be used for training """ - # Disable python profiling, because now we are starting wrap tape. - if python_profiler: - python_profiler.stop_profiling( - StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(self.mode), - end_step=0, - ) + # # Disable python profiling, because now we are starting wrap tape. 
+ # if python_profiler: + # python_profiler.stop_profiling( + # StepPhase.STEP_START, + # end_mode=mode_keys_to_python_profile_mode(self.mode), + # end_step=0, + # ) from tensorflow.python.eager.backprop import GradientTape @@ -1295,3 +1303,148 @@ def record_tensor_value(self, tensor_name, tensor_value): if self._is_collection_being_saved_for_step(CollectionKeys.METRICS): self._initialize_writers(only_initialize_if_missing=True) self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) + + def start_profiling_start_train_batch(self): + print('Start profiling train batch') + + self.start = time.time() + # santiy check + if self._is_not_supported(): + return + # set mode to TRAIN + self.set_mode(ModeKeys.TRAIN) + + self.profiling_native_training = True + if self.profiling_native_training: + self._increment_step() + # if self.step_incremented_in_on_train_begin is False: + # self._increment_step() + # else: + # self.step_incremented_in_on_train_begin = False + + print('\nStep Number in start train batch: ', self.mode_steps[ModeKeys.TRAIN]) + + # load the profiler config + self.profiler_config_parser.load_config() + + + # Dataloader + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DATALOADER_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ) and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_START_FLAG_FILENAME + ): + print('Dataloader profiling') + self.is_dataloader_profiling = True + elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_END_FLAG_FILENAME + ): + self.is_dataloader_profiling = False + + # python profiling + if python_profiler: + print('Stop python profiling in start train batch') + python_profiler.stop_profiling( + StepPhase.STEP_START, + end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + end_step=self.mode_steps[ModeKeys.TRAIN], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ): + print('Start python profiling in start train batch') + python_profiler.start_profiling( + StepPhase.STEP_START, + start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + start_step=self.mode_steps[ModeKeys.TRAIN], + ) + + # detail profiling + if is_profiler_supported_for_tf_version(): + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DETAILED_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ): + if not self.is_detailed_profiling: + self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( + self.profiler_config_parser.config.local_path, + "tensorflow", + self.mode_steps[ModeKeys.TRAIN], + ) + self.logger.info( + f"Enabling TF profiler on step: = {self.mode_steps[ModeKeys.TRAIN]}" + ) + if not self.warm_up_completed: + # warming up profiler before it will be profiling. 
+ self.tf_profiler.warmup() + self.warm_up_completed = True + self.tf_profiler.start(self._log_dir) + self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS + self.is_detailed_profiling = True + elif self.is_detailed_profiling: + self.logger.info( + f"Disabling TF profiler on step: ={self.mode_steps[ModeKeys.TRAIN]}" + ) + stop_tf_profiler( + tf_profiler=self.tf_profiler, + log_dir=self._log_dir, + start_time_us=self.tf_profiler_start_time_in_micros, + ) + self.is_detailed_profiling = False + + def start_profiling_end_train_batch(self): + print('End profiling train batch') + print('\nStep Number in end train batch: ', self.mode_steps[ModeKeys.TRAIN]) + # sanity check + if self._is_not_supported(): + return + + self.record_trace_events( + training_phase="Step:" + str(ModeKeys.TRAIN), + op_name="Step:" + str(ModeKeys.TRAIN), + phase="X", + timestamp=self.start, # this is start time for step + duration=time.time() - self.start, + pid=os.getpid(), + step_num=str(self.mode_steps[ModeKeys.TRAIN]), + ) + + if python_profiler: + print('Stop python profiling in end train batch') + python_profiler.stop_profiling( + StepPhase.STEP_END, + end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + end_step=self.mode_steps[ModeKeys.TRAIN], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ): + print('Start python profiling in end train batch') + python_profiler.start_profiling( + StepPhase.STEP_END, + start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + start_step=self.mode_steps[ModeKeys.TRAIN], + ) + + def stop_profiling_end_of_training(self): + print('\nEnd of training!') + print('\nStep Number at the end of training: ', self.step) + + # Alternatively, use self.close to close the python profiling + self.close() + + if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_END_FLAG_FILENAME + ): + print('Stop Dataloader profiling') + self.is_dataloader_profiling = False + + if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: + self.logger.info("Disabling profiler, reached end of training.") + stop_tf_profiler( + tf_profiler=self.tf_profiler, + log_dir=self._log_dir, + start_time_us=self.tf_profiler_start_time_in_micros, + ) + self.is_detailed_profiling = False + + self.profiling_native_training = False diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index e69de29bb..16b9e121a 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -0,0 +1,144 @@ +# Standard Library +import os +from pathlib import Path + +# Third Party +import tensorflow as tf +import pytest +# from tests.tensorflow2.test_keras import helper_keras_fit + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.profiler.profiler_config_parser import ProfilerConfigParser +from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX +from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents +from smdebug.tensorflow import KerasHook as Hook + +@pytest.fixture() +def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + 
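The by-step fixture above points SMPROFILER_CONFIG_PATH at test_tf2_profiler_config_parser_by_step.json, which is referenced but not included in this series. As a rough sketch only: assuming it follows the same ProfilingParameters schema as the JSON configs added later in these patches (for example test_tf2_python_profiler_all_params_config_parser_by_step.json), it would look roughly like the following — the StartStep/NumSteps values here are placeholders, not the file's actual contents.

{
  "ProfilingParameters": {
    "ProfilerEnabled": true,
    "LocalPath": "/tmp/test",
    "DetailedProfilingConfig": "{\"StartStep\": 2, \"NumSteps\": 2}"
  }
}

The by-time fixture that follows presumably points at an analogous config whose detailed-profiling window is keyed by wall-clock time rather than step number.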
+@pytest.fixture() +def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +def create_hook(trial_dir): + hook = smd.KerasHook(trial_dir, save_all=True) + return hook + + +def create_model(): + model = tf.keras.models.Sequential( + [ + # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 + tf.keras.layers.Flatten(input_shape=(28, 28, 1)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) + return model + +def helper_native_tf2(): + + +def test_gradtape_tf_function(out_dir): + def get_grads(images, labels): + # with tf.GradientTape() as tape: + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(64) + model = create_model() + hook = create_hook(out_dir) + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + dataset_labels = labels + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with hook.wrap_tape(tf.GradientTape()) as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + model.save(out_dir, save_format="tf") + + + trial = smd.create_trial(out_dir) + assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ + "weights/dense/kernel:0", + "weights/dense_1/kernel:0", + ] + assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ + "weights/dense/bias:0", + "weights/dense_1/bias:0", + ] + assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] + + +def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. 
+ """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) + hook.close() + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 \ No newline at end of file From 32cb5fdc41e962a9d289e3884b6d00add3314471 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 11 Jan 2021 11:08:36 -0800 Subject: [PATCH 03/97] add tests --- smdebug/tensorflow/keras.py | 31 +- ...filer_cprofiler_config_parser_by_step.json | 7 + ...er_pyinstrument_config_parser_by_step.json | 7 + .../tensorflow2/test_native_tf2_profiling.py | 349 +++++++++++++++++- 4 files changed, 358 insertions(+), 36 deletions(-) create mode 100644 tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json create mode 100644 tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 1c0ee5805..a87d3a903 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -121,8 +121,8 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag is used to handle step number increment in the tensorflow native training - # it indicated to profiling for tensorflow2 native training + # this flag is used to handle step number increment in the tensorflow native training when profiler is on + # it indicates to profiling for tensorflow2 native training self.profiling_native_training = False if python_profiler: @@ -1044,7 +1044,7 @@ def wrap_optimizer(self, optimizer): if isinstance(optimizer, tf.train.Optimizer): optimizer = self._wrap_apply_gradients(optimizer) elif isinstance(optimizer, tf.keras.optimizers.Optimizer) or is_keras_optimizer(optimizer): - # either subclasse of optimizerV2 class in tf.keras + # either subclass of optimizerV2 class in tf.keras # or keras.optimizers.Optimizer original_get_grads = optimizer.__class__.get_gradients @@ -1305,30 +1305,25 @@ def record_tensor_value(self, tensor_name, tensor_value): self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) def start_profiling_start_train_batch(self): + """ + Enabling profiler at the start of train batch when native tf2 training is used. 
+ """ print('Start profiling train batch') self.start = time.time() - # santiy check + if self._is_not_supported(): return - # set mode to TRAIN self.set_mode(ModeKeys.TRAIN) self.profiling_native_training = True if self.profiling_native_training: self._increment_step() - # if self.step_incremented_in_on_train_begin is False: - # self._increment_step() - # else: - # self.step_incremented_in_on_train_begin = False print('\nStep Number in start train batch: ', self.mode_steps[ModeKeys.TRAIN]) - # load the profiler config self.profiler_config_parser.load_config() - - # Dataloader if self.profiler_config_parser.should_save_metrics( MetricsCategory.DATALOADER_PROFILING, self.mode_steps[ModeKeys.TRAIN] ) and self.profiler_config_parser.write_tf_dataloader_flag( @@ -1341,7 +1336,6 @@ def start_profiling_start_train_batch(self): ): self.is_dataloader_profiling = False - # python profiling if python_profiler: print('Stop python profiling in start train batch') python_profiler.stop_profiling( @@ -1359,7 +1353,6 @@ def start_profiling_start_train_batch(self): start_step=self.mode_steps[ModeKeys.TRAIN], ) - # detail profiling if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( MetricsCategory.DETAILED_PROFILING, self.mode_steps[ModeKeys.TRAIN] @@ -1392,9 +1385,12 @@ def start_profiling_start_train_batch(self): self.is_detailed_profiling = False def start_profiling_end_train_batch(self): + """ + Enabling profiler at the end of train batch when native tf2 training is used. + """ print('End profiling train batch') print('\nStep Number in end train batch: ', self.mode_steps[ModeKeys.TRAIN]) - # sanity check + if self._is_not_supported(): return @@ -1426,10 +1422,13 @@ def start_profiling_end_train_batch(self): ) def stop_profiling_end_of_training(self): + """ + Stop profiler at the end of training when native tf2 training is used. 
+ """ print('\nEnd of training!') print('\nStep Number at the end of training: ', self.step) - # Alternatively, use self.close to close the python profiling + # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json new file mode 100644 index 000000000..2ab039217 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -0,0 +1,7 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2}" + } +} \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json new file mode 100644 index 000000000..325224801 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -0,0 +1,7 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"python_profiler\": \"Pyinstrument\"}" + } +} \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 16b9e121a..3e1f08b2b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -1,19 +1,40 @@ # Standard Library import os +import json +import time +from datetime import datetime from pathlib import Path +import pstats # Third Party import tensorflow as tf import pytest -# from tests.tensorflow2.test_keras import helper_keras_fit # First Party import smdebug.tensorflow as smd from smdebug.core.collection import CollectionKeys +from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter from smdebug.profiler.profiler_config_parser import ProfilerConfigParser from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents from smdebug.tensorflow import KerasHook as Hook +from smdebug.profiler.profiler_constants import ( + CONVERT_TO_MICROSECS, + DEFAULT_PREFIX, + TRACE_DIRECTORY_FORMAT, + CPROFILE_NAME, + CPROFILE_STATS_FILENAME, + PYINSTRUMENT_HTML_FILENAME, + PYINSTRUMENT_JSON_FILENAME, + PYINSTRUMENT_NAME, +) +from smdebug.profiler.python_profiler import PythonProfiler +from smdebug.profiler.python_profiler import ( + PyinstrumentPythonProfiler, + cProfilePythonProfiler, + cProfileTimer, +) +from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase @pytest.fixture() def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @@ -22,6 +43,20 @@ def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): return ProfilerConfigParser() +@pytest.fixture() +def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_python_pyinstrument_config_parser_by_step(config_folder, 
monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + @pytest.fixture() def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") @@ -29,9 +64,24 @@ def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): return ProfilerConfigParser() -def create_hook(trial_dir): - hook = smd.KerasHook(trial_dir, save_all=True) - return hook +@pytest.fixture +def test_framework(): + return "test-framework" + + +@pytest.fixture() +def cprofile_python_profiler(out_dir, test_framework): + return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) + + +@pytest.fixture() +def pyinstrument_python_profiler(out_dir, test_framework): + return PyinstrumentPythonProfiler(out_dir, test_framework) + + +@pytest.fixture() +def framework_dir(out_dir, test_framework): + return "{0}/framework/{1}".format(out_dir, test_framework) def create_model(): @@ -46,12 +96,10 @@ def create_model(): ) return model -def helper_native_tf2(): +def helper_native_tf2_profiler_debugger(trial_dir, hook): -def test_gradtape_tf_function(out_dir): def get_grads(images, labels): - # with tf.GradientTape() as tape: return model(images, training=True) @tf.function @@ -63,32 +111,29 @@ def train_step(images, labels): dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) ) - dataset = dataset.shuffle(1000).batch(64) + dataset = dataset.shuffle(1000).batch(128) model = create_model() - hook = create_hook(out_dir) opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: - dataset_labels = labels labels = tf.one_hot(labels, depth=10) hook.start_profiling_start_train_batch() with hook.wrap_tape(tf.GradientTape()) as tape: logits = train_step(data, labels) grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) - # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(out_dir, save_format="tf") - + model.save(trial_dir, save_format="tf") - trial = smd.create_trial(out_dir) + trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ "weights/dense/kernel:0", @@ -109,7 +154,40 @@ def train_step(images, labels): assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] -def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): +def helper_native_tf2_profiler(trial_dir, hook): + + def get_grads(images, labels): + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., 
tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(128) + model = create_model() + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with tf.GradientTape() as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): """ This test executes a TF2 native training script, enables detailed TF profiling by step, and verifies the number of events. @@ -117,8 +195,7 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config assert tf2_profiler_config_parser_by_step.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) - hook.close() + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -141,4 +218,236 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config # The number of events is varying by a small number on # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 \ No newline at end of file + assert num_trace_events >= 230 + + +def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. 
+ """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + + +# @pytest.mark.parametrize("use_pyinstrument", [False, True]) +# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) +def test_native_python_profiling_cprofiler( + out_dir, tf2_python_cprofiler_config_parser_by_step +): + assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled + + config = tf2_python_cprofiler_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. 
+ assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_native_python_profiling_pyinstrument( + out_dir, tf2_python_pyinstrument_config_parser_by_step +): + assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled + + config = tf2_python_pyinstrument_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.profiler_name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = PYINSTRUMENT_NAME + allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_create_timeline_file(simple_profiler_config_parser, out_dir): + """ + This test is meant to test successful creation of the timeline file according to file path specification. + $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ + {$ENV_NODE_ID_4digits0padded}_pythontimeline.json + It reads backs the file contents to make sure it is in valid JSON format. 
+ """ + assert simple_profiler_config_parser.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict \ No newline at end of file From 8e1d6f2aff92d786b4eea46fa287537376d7e3ee Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 12:05:19 -0800 Subject: [PATCH 04/97] add python profiler as attr for kerashook --- smdebug/tensorflow/keras.py | 57 ++++++++++++------- ...er_pyinstrument_config_parser_by_step.json | 2 +- .../tensorflow2/test_native_tf2_profiling.py | 6 +- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index a87d3a903..009efe230 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,6 +61,7 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) +print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -125,7 +126,11 @@ def __init__( # it indicates to profiling for tensorflow2 native training self.profiling_native_training = False - if python_profiler: + self.python_profiler = python_profiler + print('\nObject inside: ', self.python_profiler) + + if self.python_profiler: + print('exit stop profiling object inside: ', self.python_profiler, self.step) atexit.register(python_profiler.stop_profiling, StepPhase.END) def _is_not_supported(self): @@ -753,13 +758,13 @@ def on_test_begin(self, logs=None): self._on_any_mode_begin(ModeKeys.EVAL) def _on_any_mode_end(self, mode): - if python_profiler: - python_profiler.stop_profiling( + if self.python_profiler: + self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(mode), end_step=self.mode_steps[mode], ) - python_profiler.start_profiling( + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(mode), start_step=self.mode_steps[mode], @@ -837,8 +842,8 @@ def _on_any_batch_begin(self, batch, mode, logs=None): ): self.is_dataloader_profiling = False - if python_profiler: - python_profiler.stop_profiling( + if self.python_profiler: + self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), end_step=self.mode_steps[mode], @@ -846,7 +851,7 @@ def _on_any_batch_begin(self, batch, mode, logs=None): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - python_profiler.start_profiling( + self.python_profiler.start_profiling( StepPhase.STEP_START, start_mode=mode_keys_to_python_profile_mode(mode), start_step=self.mode_steps[mode], @@ -1007,8 +1012,8 @@ def _on_any_batch_end(self, batch, mode, logs=None): self._export_model() self._exported_model[self.mode] = True - if python_profiler: - python_profiler.stop_profiling( + if self.python_profiler: 
+ self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(mode), end_step=self.mode_steps[mode], @@ -1016,7 +1021,7 @@ def _on_any_batch_end(self, batch, mode, logs=None): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - python_profiler.start_profiling( + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(mode), start_step=self.mode_steps[mode], @@ -1103,14 +1108,16 @@ def unwrap(func): def close(self): self._cleanup() print('\nStep Number in the close function: ', self.step) - if python_profiler: + if self.python_profiler: print('python profiling for end of last train step to end of training') - python_profiler.start_profiling( + print('close start profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) + def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1308,7 +1315,15 @@ def start_profiling_start_train_batch(self): """ Enabling profiler at the start of train batch when native tf2 training is used. """ - print('Start profiling train batch') + + # print('Start profiling train batch') + # a = PythonProfiler.get_python_profiler(profiler_config_parser.config, 'tensorflow') + # print('\nProfiler enabled: ', a) + # + # print('\nname: ', self.profiler_config_parser.config.python_profiling_config.name) + # print('\npython profiling: ', self.python_profiler) + # print('\nstart_step: ', self.profiler_config_parser.config.python_profiling_config.start_step) + # print('\nnum_steps: ', self.profiler_config_parser.config.python_profiling_config.num_steps) self.start = time.time() @@ -1336,9 +1351,10 @@ def start_profiling_start_train_batch(self): ): self.is_dataloader_profiling = False - if python_profiler: + if self.python_profiler: print('Stop python profiling in start train batch') - python_profiler.stop_profiling( + print('start train batch stop profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), end_step=self.mode_steps[ModeKeys.TRAIN], @@ -1347,7 +1363,8 @@ def start_profiling_start_train_batch(self): MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] ): print('Start python profiling in start train batch') - python_profiler.start_profiling( + print('start train batch start profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.start_profiling( StepPhase.STEP_START, start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), start_step=self.mode_steps[ModeKeys.TRAIN], @@ -1404,9 +1421,10 @@ def start_profiling_end_train_batch(self): step_num=str(self.mode_steps[ModeKeys.TRAIN]), ) - if python_profiler: + if self.python_profiler: print('Stop python profiling in end train batch') - python_profiler.stop_profiling( + print('end train batch stop profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), end_step=self.mode_steps[ModeKeys.TRAIN], @@ -1415,7 +1433,8 @@ def start_profiling_end_train_batch(self): MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] ): print('Start python profiling in end train batch') - python_profiler.start_profiling( + print('end train 
batch start profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), start_step=self.mode_steps[ModeKeys.TRAIN], diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 325224801..02bc8c0d3 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"python_profiler\": \"Pyinstrument\"}" + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" } } \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 3e1f08b2b..9ba8dd60b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -173,6 +173,8 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + + # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -338,6 +340,7 @@ def test_native_python_profiling_cprofiler( print('\nnum_steps: ', config.python_profiling_config.num_steps) python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -347,6 +350,7 @@ def test_native_python_profiling_cprofiler( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. 
@@ -384,7 +388,7 @@ def test_native_python_profiling_pyinstrument( print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps From 7f8b51e04f7b731baad8b83fa335eeb3e3468d90 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 16:01:59 -0800 Subject: [PATCH 05/97] add tests --- smdebug/tensorflow/keras.py | 2 +- ...filer_cprofiler_config_parser_by_step.json | 2 +- ...er_pyinstrument_config_parser_by_step.json | 2 +- .../tensorflow2/test_native_tf2_profiling.py | 34 ++++++++++++------- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 009efe230..d31017179 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -131,7 +131,7 @@ def __init__( if self.python_profiler: print('exit stop profiling object inside: ', self.python_profiler, self.step) - atexit.register(python_profiler.stop_profiling, StepPhase.END) + atexit.register(self.python_profiler.stop_profiling, StepPhase.END) def _is_not_supported(self): if self.distribution_strategy is None: diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index 2ab039217..d568b471c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2}" + "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } } \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 02bc8c0d3..c1c45594c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" + "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 9ba8dd60b..9a7416be3 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -5,6 +5,7 @@ from datetime import datetime from pathlib import Path import pstats +import atexit # Third Party import tensorflow as tf @@ -84,6 +85,18 @@ def framework_dir(out_dir, test_framework): return "{0}/framework/{1}".format(out_dir, test_framework) +def set_up_profiling(profilerconfig): + profiler_config_parser = profilerconfig + python_profiler = None + if profiler_config_parser.profiling_enabled: + config = 
profiler_config_parser.config + if config.python_profiling_config.is_enabled(): + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + python_profiler.start_profiling(StepPhase.START) + atexit.register(python_profiler.stop_profiling, StepPhase.END) + return profiler_config_parser, python_profiler + + def create_model(): model = tf.keras.models.Sequential( [ @@ -131,7 +144,7 @@ def train_step(images, labels): hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(trial_dir, save_format="tf") + # model.save(trial_dir, save_format="tf") trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] @@ -173,8 +186,6 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) - - # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -196,7 +207,7 @@ def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parse """ assert tf2_profiler_config_parser_by_step.profiling_enabled - hook = Hook(out_dir=out_dir) + hook = Hook(out_dir=out_dir, save_all=True) helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -333,13 +344,13 @@ def test_native_python_profiling_cprofiler( ): assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - config = tf2_python_cprofiler_config_parser_by_step.config + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) + + config = profiler_config_parser.config print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) print('\nname: ', config.python_profiling_config.name) print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps @@ -382,13 +393,9 @@ def test_native_python_profiling_pyinstrument( ): assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - config = tf2_python_pyinstrument_config_parser_by_step.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.profiler_name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + config = profiler_config_parser.config start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -398,6 +405,7 @@ def test_native_python_profiling_pyinstrument( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. 
From ac60d3aa24ae3c2e0354be3a5365e88b8869bf91 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 14:16:24 -0800 Subject: [PATCH 06/97] update profiler for native tf training --- smdebug/tensorflow/keras.py | 138 +++--- ...iler_all_params_config_parser_by_step.json | 8 + ...filer_cprofiler_config_parser_by_step.json | 2 +- ...er_pyinstrument_config_parser_by_step.json | 2 +- ...ofiling.py => test_native_tf2_profiler.py} | 397 +++++++++--------- 5 files changed, 271 insertions(+), 276 deletions(-) create mode 100644 tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json rename tests/profiler/tensorflow2/{test_native_tf2_profiling.py => test_native_tf2_profiler.py} (57%) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index d31017179..dbfc8bf1d 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,7 +61,7 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) -print('prezero-step start profiling object outside: ', python_profiler) +# print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -106,6 +106,7 @@ def __init__( # Profiling vars self.tf_profiler = None if is_profiler_supported_for_tf_version(): + # Third Party from tensorflow.python.profiler import profiler_v2 as tf_profiler self.tf_profiler = tf_profiler @@ -114,6 +115,7 @@ def __init__( self.is_dataloader_profiling = False self.tf_profiler_start_time_in_micros = 0 self.warm_up_completed = False + self.python_profiler = python_profiler # supports_tf_logs property was introduced in TF 2.3.0 # it indicates to the framework that the callback is not # limited to reading only numpy logs @@ -122,15 +124,10 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag is used to handle step number increment in the tensorflow native training when profiler is on - # it indicates to profiling for tensorflow2 native training - self.profiling_native_training = False - - self.python_profiler = python_profiler - print('\nObject inside: ', self.python_profiler) + # this flag indicates to debugging for tensorflow2 native training + self.debugger_native_training = False if self.python_profiler: - print('exit stop profiling object inside: ', self.python_profiler, self.step) atexit.register(self.python_profiler.stop_profiling, StepPhase.END) def _is_not_supported(self): @@ -340,6 +337,7 @@ def _create_tensors_for_matching_collections( def _get_distributed_model(self, mode): # not available in tf 1.13, code shouldn't reach here for 1.13 # because of _is_not_supported + # Third Party from tensorflow.python.keras.distribute.distributed_training_utils import ( get_distributed_model, ) @@ -1107,16 +1105,15 @@ def unwrap(func): def close(self): self._cleanup() - print('\nStep Number in the close function: ', self.step) + print("\nStep Number in the close function: ", self.step) if self.python_profiler: - print('python profiling for end of last train step to end of training') - print('close start profiling object inside: ', self.python_profiler, self.step) + # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, 
start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) - + self.debugger_native_training = False def _cleanup(self): # Unwrap the tape before closing @@ -1155,11 +1152,11 @@ def run(*args, **kwargs): self._prepare_collections() self.prepared_collections = True - if not self.profiling_native_training: - self._increment_step() - print('\nStep number in the push tape: ', self.step) + self._increment_step() + print("\nStep number in the push tape: ", self.step) if self._get_collections_to_save_for_step(): + # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) self._initialize_writers() if self.last_saved_step is not None and self._exported_collections is False: @@ -1247,7 +1244,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - print('\nStep number in the pop tape: ', self.step) + print("\nStep number in the pop tape: ", self.step) return run @@ -1274,16 +1271,12 @@ def wrap_tape(self, tape): :return: Wrapped tape of same type as passed. This tape should be used for training """ - # # Disable python profiling, because now we are starting wrap tape. - # if python_profiler: - # python_profiler.stop_profiling( - # StepPhase.STEP_START, - # end_mode=mode_keys_to_python_profile_mode(self.mode), - # end_step=0, - # ) - + # Third Party from tensorflow.python.eager.backprop import GradientTape + self.debugger_native_training = True + self.set_mode(ModeKeys.TRAIN) + if isinstance(tape, GradientTape): # unwrap tape before wrapping new tape to avoid recursive wrap tapes if self.tape: @@ -1311,78 +1304,67 @@ def record_tensor_value(self, tensor_name, tensor_value): self._initialize_writers(only_initialize_if_missing=True) self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) - def start_profiling_start_train_batch(self): + def profiling_start_batch(self, mode): """ Enabling profiler at the start of train batch when native tf2 training is used. 
""" - - # print('Start profiling train batch') - # a = PythonProfiler.get_python_profiler(profiler_config_parser.config, 'tensorflow') - # print('\nProfiler enabled: ', a) - # - # print('\nname: ', self.profiler_config_parser.config.python_profiling_config.name) - # print('\npython profiling: ', self.python_profiler) - # print('\nstart_step: ', self.profiler_config_parser.config.python_profiling_config.start_step) - # print('\nnum_steps: ', self.profiler_config_parser.config.python_profiling_config.num_steps) - self.start = time.time() if self._is_not_supported(): return - self.set_mode(ModeKeys.TRAIN) - self.profiling_native_training = True - if self.profiling_native_training: - self._increment_step() + self.set_mode(mode) - print('\nStep Number in start train batch: ', self.mode_steps[ModeKeys.TRAIN]) + if not self.debugger_native_training: + self.step += 1 + self.mode_steps[self.mode] += 1 + # Increment Global step number irrespective of what mode it is + if self.mode != ModeKeys.GLOBAL: + self.mode_steps[ModeKeys.GLOBAL] = self.step + + print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DATALOADER_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] ) and self.profiler_config_parser.write_tf_dataloader_flag( TF_DATALOADER_START_FLAG_FILENAME ): - print('Dataloader profiling') self.is_dataloader_profiling = True elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): self.is_dataloader_profiling = False if self.python_profiler: - print('Stop python profiling in start train batch') - print('start train batch stop profiling object inside: ', self.python_profiler, self.step) + # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - end_step=self.mode_steps[ModeKeys.TRAIN], + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], ) if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - print('Start python profiling in start train batch') - print('start train batch start profiling object inside: ', self.python_profiler, self.step) + # print("Start python profiling in start train batch") self.python_profiler.start_profiling( StepPhase.STEP_START, - start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - start_step=self.mode_steps[ModeKeys.TRAIN], + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], ) if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] ): if not self.is_detailed_profiling: self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( self.profiler_config_parser.config.local_path, "tensorflow", - self.mode_steps[ModeKeys.TRAIN], - ) - self.logger.info( - f"Enabling TF profiler on step: = {self.mode_steps[ModeKeys.TRAIN]}" + self.mode_steps[mode], ) + self.logger.info(f"Enabling TF profiler on step: = {self.mode_steps[mode]}") if not self.warm_up_completed: # warming up profiler before it will be profiling. 
self.tf_profiler.warmup() @@ -1391,9 +1373,7 @@ def start_profiling_start_train_batch(self): self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS self.is_detailed_profiling = True elif self.is_detailed_profiling: - self.logger.info( - f"Disabling TF profiler on step: ={self.mode_steps[ModeKeys.TRAIN]}" - ) + self.logger.info(f"Disabling TF profiler on step: ={self.mode_steps[mode]}") stop_tf_profiler( tf_profiler=self.tf_profiler, log_dir=self._log_dir, @@ -1401,59 +1381,55 @@ def start_profiling_start_train_batch(self): ) self.is_detailed_profiling = False - def start_profiling_end_train_batch(self): + def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - print('End profiling train batch') - print('\nStep Number in end train batch: ', self.mode_steps[ModeKeys.TRAIN]) + print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return self.record_trace_events( - training_phase="Step:" + str(ModeKeys.TRAIN), - op_name="Step:" + str(ModeKeys.TRAIN), + training_phase="Step:" + str(mode), + op_name="Step:" + str(mode), phase="X", timestamp=self.start, # this is start time for step duration=time.time() - self.start, pid=os.getpid(), - step_num=str(self.mode_steps[ModeKeys.TRAIN]), + step_num=str(self.mode_steps[mode]), ) if self.python_profiler: - print('Stop python profiling in end train batch') - print('end train batch stop profiling object inside: ', self.python_profiler, self.step) + # print("Stop python profiling in end train batch") self.python_profiler.stop_profiling( StepPhase.STEP_END, - end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - end_step=self.mode_steps[ModeKeys.TRAIN], + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], ) if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - print('Start python profiling in end train batch') - print('end train batch start profiling object inside: ', self.python_profiler, self.step) + # print("Start python profiling in end train batch") self.python_profiler.start_profiling( StepPhase.STEP_END, - start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - start_step=self.mode_steps[ModeKeys.TRAIN], + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], ) - def stop_profiling_end_of_training(self): + def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" - print('\nEnd of training!') - print('\nStep Number at the end of training: ', self.step) + print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): - print('Stop Dataloader profiling') + # print("Stop Dataloader profiling") self.is_dataloader_profiling = False if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: @@ -1464,5 +1440,3 @@ def stop_profiling_end_of_training(self): start_time_us=self.tf_profiler_start_time_in_micros, ) self.is_detailed_profiling = False - - self.profiling_native_training = False diff --git a/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json new file mode 100644 index 000000000..c119eebf8 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json @@ -0,0 +1,8 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "DetailedProfilingConfig": "{\"StartStep\": 2, \"NumSteps\": 2}", + "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" + } +} diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index d568b471c..e51c386c2 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} \ No newline at end of file +} diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index c1c45594c..53ac1485e 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } -} \ No newline at end of file +} diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py similarity index 57% rename from tests/profiler/tensorflow2/test_native_tf2_profiling.py rename to tests/profiler/tensorflow2/test_native_tf2_profiler.py index 9a7416be3..be2b6e124 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -1,41 +1,36 @@ # Standard Library -import os +import atexit import json +import os +import pstats import time from datetime import datetime from pathlib import Path -import pstats -import atexit # Third Party -import tensorflow as tf import pytest +import tensorflow as tf # First Party import smdebug.tensorflow as smd from smdebug.core.collection import CollectionKeys -from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter from smdebug.profiler.profiler_config_parser import ProfilerConfigParser -from smdebug.profiler.profiler_constants import 
TENSORBOARDTIMELINE_SUFFIX -from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents -from smdebug.tensorflow import KerasHook as Hook from smdebug.profiler.profiler_constants import ( CONVERT_TO_MICROSECS, - DEFAULT_PREFIX, - TRACE_DIRECTORY_FORMAT, CPROFILE_NAME, CPROFILE_STATS_FILENAME, + DEFAULT_PREFIX, PYINSTRUMENT_HTML_FILENAME, PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_NAME, + TENSORBOARDTIMELINE_SUFFIX, + TRACE_DIRECTORY_FORMAT, ) +from smdebug.profiler.python_profile_utils import StepPhase from smdebug.profiler.python_profiler import PythonProfiler -from smdebug.profiler.python_profiler import ( - PyinstrumentPythonProfiler, - cProfilePythonProfiler, - cProfileTimer, -) -from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase +from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents +from smdebug.tensorflow import KerasHook as Hook + @pytest.fixture() def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @@ -46,14 +41,18 @@ def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @pytest.fixture() def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json" + ) monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) return ProfilerConfigParser() @pytest.fixture() def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json" + ) monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) return ProfilerConfigParser() @@ -65,24 +64,13 @@ def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): return ProfilerConfigParser() -@pytest.fixture -def test_framework(): - return "test-framework" - - @pytest.fixture() -def cprofile_python_profiler(out_dir, test_framework): - return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) - - -@pytest.fixture() -def pyinstrument_python_profiler(out_dir, test_framework): - return PyinstrumentPythonProfiler(out_dir, test_framework) - - -@pytest.fixture() -def framework_dir(out_dir, test_framework): - return "{0}/framework/{1}".format(out_dir, test_framework) +def tf2_profiler_config_parser_by_step_all_params(config_folder, monkeypatch): + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_all_params_config_parser_by_step.json" + ) + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() def set_up_profiling(profilerconfig): @@ -110,65 +98,19 @@ def create_model(): return model -def helper_native_tf2_profiler_debugger(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - +def prepare_dataset(): mnist = tf.keras.datasets.mnist (x_train, y_train), _ = mnist.load_data() dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - 
for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with hook.wrap_tape(tf.GradientTape()) as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - # model.save(trial_dir, save_format="tf") - - trial = smd.create_trial(trial_dir) - assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] - assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ - "weights/dense/kernel:0", - "weights/dense_1/kernel:0", - ] - assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ - "weights/dense/bias:0", - "weights/dense_1/bias:0", - ] - assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ - "Adam/beta_1:0", - "Adam/beta_2:0", - "Adam/decay:0", - "Adam/iter:0", - "Adam/learning_rate:0", - ] - assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] - assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] - + dataset = dataset.shuffle(1000).batch(64) + return dataset -def helper_native_tf2_profiler(trial_dir, hook): +def helper_native_tf2_gradtape( + hook, debugger=False, python_profiler=None, start_step=None, end_step=None +): def get_grads(images, labels): return model(images, training=True) @@ -176,73 +118,51 @@ def get_grads(images, labels): def train_step(images, labels): return tf.reduce_mean(get_grads(images, labels)) - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) + dataset = prepare_dataset() model = create_model() opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + current_step = 0 n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with tf.GradientTape() as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() + if debugger: + with hook.wrap_tape(tf.GradientTape()) as tape: + hook.profiling_start_batch(mode=smd.modes.TRAIN) + logits = train_step(data, labels) + if python_profiler and start_step <= current_step < end_step: + assert python_profiler._start_step == current_step + assert python_profiler._start_phase == StepPhase.STEP_START + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + else: + with tf.GradientTape() as tape: + hook.profiling_start_batch(mode=smd.modes.TRAIN) + logits = train_step(data, labels) + if python_profiler and start_step <= current_step < end_step: + assert python_profiler._start_step == current_step + assert python_profiler._start_phase == StepPhase.STEP_START + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, 
model.variables)) + hook.profiling_end_batch(mode=smd.modes.TRAIN) + hook.profiling_end() @pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir, save_all=True) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and + This test executes a TF2 native training script with profiler, enables detailed TF profiling by step, and verifies the number of events. """ assert tf2_profiler_config_parser_by_step.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape(hook=hook) t_events = TensorboardProfilerEvents() @@ -269,15 +189,15 @@ def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step @pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and + This test executes a TF2 native training script with profiler, enables detailed TF profiling by time, and verifies the number of events. """ assert tf2_profiler_config_parser_by_time.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape(hook=hook) # get tensorboard timeline files files = [] @@ -302,67 +222,33 @@ def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parse # consecutive runs. Hence, the approximation in the below asserts. assert num_trace_events >= 700 + @pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): +def test_native_python_profiling_cprofiler(out_dir, tf2_python_cprofiler_config_parser_by_step): """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. + This test executes a TF2 native training script with profiler, enables cprofiler by step, and + verifies the python profiling's steps and expected output files. 
""" - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - - -# @pytest.mark.parametrize("use_pyinstrument", [False, True]) -# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) -def test_native_python_profiling_cprofiler( - out_dir, tf2_python_cprofiler_config_parser_by_step -): assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) + profiler_config_parser, python_profiler = set_up_profiling( + tf2_python_cprofiler_config_parser_by_step + ) config = profiler_config_parser.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) - print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps profiler_name = CPROFILE_NAME allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) hook = Hook(out_dir=out_dir) hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape( + hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) # Test that directory and corresponding files exist. assert os.path.isdir(python_stats_dir) @@ -370,6 +256,9 @@ def test_native_python_profiling_cprofiler( for node_id in os.listdir(python_stats_dir): node_dir_path = os.path.join(python_stats_dir, node_id) stats_dirs = os.listdir(node_dir_path) + # Since python_profiler.stop_profiling for the posthookclose step automatically executed + # upon normal interpreter termination, + # the number of the files is (end_step - start_step) * 2 + 2 - 1. assert len(stats_dirs) == (end_step - start_step) * 2 + 1 for stats_dir in stats_dirs: @@ -388,12 +277,19 @@ def test_native_python_profiling_cprofiler( assert json.load(f) +@pytest.mark.skip_if_non_eager def test_native_python_profiling_pyinstrument( out_dir, tf2_python_pyinstrument_config_parser_by_step ): + """ + This test executes a TF2 native training script with profiler, enables pyinstrument by step, and + verifies the python profiling's steps and expected output files. 
+ """ assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) + profiler_config_parser, python_profiler = set_up_profiling( + tf2_python_pyinstrument_config_parser_by_step + ) config = profiler_config_parser.config start_step = config.python_profiling_config.start_step @@ -402,11 +298,13 @@ def test_native_python_profiling_pyinstrument( profiler_name = PYINSTRUMENT_NAME allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) hook = Hook(out_dir=out_dir) hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape( + hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) # Test that directory and corresponding files exist. assert os.path.isdir(python_stats_dir) @@ -414,6 +312,9 @@ def test_native_python_profiling_pyinstrument( for node_id in os.listdir(python_stats_dir): node_dir_path = os.path.join(python_stats_dir, node_id) stats_dirs = os.listdir(node_dir_path) + # Since python_profiler.stop_profiling for the posthookclose step automatically executed + # upon normal interpreter termination, + # the number of the files is (end_step - start_step) * 2 + 2 - 1. assert len(stats_dirs) == (end_step - start_step) * 2 + 1 for stats_dir in stats_dirs: @@ -432,17 +333,16 @@ def test_native_python_profiling_pyinstrument( assert json.load(f) +@pytest.mark.skip_if_non_eager def test_create_timeline_file(simple_profiler_config_parser, out_dir): """ - This test is meant to test successful creation of the timeline file according to file path specification. - $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ - {$ENV_NODE_ID_4digits0padded}_pythontimeline.json + This test is to test the creation of the timeline file according to file path specification. It reads backs the file contents to make sure it is in valid JSON format. """ assert simple_profiler_config_parser.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape(hook=hook) files = [] for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): @@ -462,4 +362,117 @@ def test_create_timeline_file(simple_profiler_config_parser, out_dir): with open(files[0]) as timeline_file: events_dict = json.load(timeline_file) - assert events_dict \ No newline at end of file + assert events_dict + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_debugger_all_params( + tf2_profiler_config_parser_by_step_all_params, out_dir +): + """ + This test executes a TF2 native training script with debugger and profiler, enables detailed TF profiling, python + profiling by step. 
+ """ + assert tf2_profiler_config_parser_by_step_all_params.profiling_enabled + + profiler_config_parser, python_profiler = set_up_profiling( + tf2_profiler_config_parser_by_step_all_params + ) + + config = profiler_config_parser.config + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) + + hook = Hook(out_dir=out_dir, save_all=True) + hook.python_profiler = python_profiler + helper_native_tf2_gradtape(hook=hook, debugger=True) + + # Verifying python profiling related files. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + # Verifying detailed TF profiling. + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path( + tf2_profiler_config_parser_by_step_all_params.config.local_path + "/framework" + ).rglob(f"*{TENSORBOARDTIMELINE_SUFFIX}"): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + # Verifying timeline files. + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict + + # Verifying tensor names. + trial = smd.create_trial(out_dir) + assert len(trial.steps()) > 0, "Nothing saved at any step." + assert len(trial.tensor_names()) > 0, "Tensors were not saved." 
+ assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0 + assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) > 0 + assert trial.tensor_names(collection="optimizer_variables") == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] From d40de5e616e70ea8cc4688af2212ef249d9eabc7 Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Wed, 13 Jan 2021 15:25:40 -0700 Subject: [PATCH 07/97] Modify distributed_training_utils.py import for TF 2.4 (#422) --- smdebug/tensorflow/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index af9a0e901..11d700dfc 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -427,6 +427,10 @@ def is_tf_version_greater_than_2_4_x(): return version.parse("2.4.0") <= TF_VERSION +def is_tf_version_greater_than_2_4_x(): + return version.parse("2.4.0") <= version.parse(tf.__version__) + + def is_profiler_supported_for_tf_version(): # Profiler Support Added For TF Versions 2.2.0 And Greater return version.parse("2.2.0") <= TF_VERSION From 7cde99eb641c3ccfea9ab774ae08d057a97067a4 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 19:48:15 -0800 Subject: [PATCH 08/97] remove print statement --- smdebug/tensorflow/keras.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index dbfc8bf1d..439ee09a8 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1105,7 +1105,7 @@ def unwrap(func): def close(self): self._cleanup() - print("\nStep Number in the close function: ", self.step) + # print("\nStep Number in the close function: ", self.step) if self.python_profiler: # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( @@ -1153,7 +1153,7 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() - print("\nStep number in the push tape: ", self.step) + # print("\nStep number in the push tape: ", self.step) if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1244,7 +1244,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - print("\nStep number in the pop tape: ", self.step) + # print("\nStep number in the pop tape: ", self.step) return run @@ -1322,7 +1322,7 @@ def profiling_start_batch(self, mode): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - print("Step Number in start train batch: ", self.mode_steps[mode]) + # print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() @@ -1385,7 +1385,7 @@ def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - print("Step Number in end train batch: ", self.mode_steps[mode]) + # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1421,7 +1421,7 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" - print("Step Number at the end of training: ", self.step) + # print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() From 3cb92b91e503f236c15a93c10fa41ef33355b488 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:32:23 -0800 Subject: [PATCH 09/97] add changes for enabling profiling for native tf2 training --- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py new file mode 100644 index 000000000..e69de29bb From f1ee1e4973fe3baa2b01303249763d4b22b6b86d Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:42:55 -0800 Subject: [PATCH 10/97] add changes for enabling profiling in the native tf2 training --- smdebug/tensorflow/keras.py | 7 +- .../tensorflow2/test_native_tf2_profiling.py | 144 ++++++++++++++++++ 2 files changed, 147 insertions(+), 4 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 439ee09a8..22ae7b588 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1271,7 +1271,6 @@ def wrap_tape(self, tape): :return: Wrapped tape of same type as passed. This tape should be used for training """ - # Third Party from tensorflow.python.eager.backprop import GradientTape self.debugger_native_training = True @@ -1356,7 +1355,7 @@ def profiling_start_batch(self, mode): if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] ): if not self.is_detailed_profiling: self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( @@ -1386,7 +1385,6 @@ def profiling_end_batch(self, mode): Enabling profiler at the end of train batch when native tf2 training is used. 
""" # print("Step Number in end train batch: ", self.mode_steps[mode]) - if self._is_not_supported(): return @@ -1427,7 +1425,7 @@ def profiling_end(self): self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): # print("Stop Dataloader profiling") self.is_dataloader_profiling = False @@ -1440,3 +1438,4 @@ def profiling_end(self): start_time_us=self.tf_profiler_start_time_in_micros, ) self.is_detailed_profiling = False + diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index e69de29bb..16b9e121a 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -0,0 +1,144 @@ +# Standard Library +import os +from pathlib import Path + +# Third Party +import tensorflow as tf +import pytest +# from tests.tensorflow2.test_keras import helper_keras_fit + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.profiler.profiler_config_parser import ProfilerConfigParser +from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX +from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents +from smdebug.tensorflow import KerasHook as Hook + +@pytest.fixture() +def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +def create_hook(trial_dir): + hook = smd.KerasHook(trial_dir, save_all=True) + return hook + + +def create_model(): + model = tf.keras.models.Sequential( + [ + # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 + tf.keras.layers.Flatten(input_shape=(28, 28, 1)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) + return model + +def helper_native_tf2(): + + +def test_gradtape_tf_function(out_dir): + def get_grads(images, labels): + # with tf.GradientTape() as tape: + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(64) + model = create_model() + hook = create_hook(out_dir) + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + dataset_labels = labels + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with hook.wrap_tape(tf.GradientTape()) as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + # hook.save_tensor("logits", logits, 
CollectionKeys.OUTPUTS) + # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + model.save(out_dir, save_format="tf") + + + trial = smd.create_trial(out_dir) + assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ + "weights/dense/kernel:0", + "weights/dense_1/kernel:0", + ] + assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ + "weights/dense/bias:0", + "weights/dense_1/bias:0", + ] + assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] + + +def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) + hook.close() + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. 
+ assert num_trace_events >= 230 \ No newline at end of file From bc0388d6f34d7ca13cf3f9ae7bac87e493da3146 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 11 Jan 2021 11:08:36 -0800 Subject: [PATCH 11/97] add tests --- ...filer_cprofiler_config_parser_by_step.json | 1 + ...er_pyinstrument_config_parser_by_step.json | 1 + .../tensorflow2/test_native_tf2_profiling.py | 349 +++++++++++++++++- 3 files changed, 331 insertions(+), 20 deletions(-) diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index e51c386c2..f06218f77 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -5,3 +5,4 @@ "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } } + diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 53ac1485e..ad5a555f7 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -5,3 +5,4 @@ "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } + diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 16b9e121a..3e1f08b2b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -1,19 +1,40 @@ # Standard Library import os +import json +import time +from datetime import datetime from pathlib import Path +import pstats # Third Party import tensorflow as tf import pytest -# from tests.tensorflow2.test_keras import helper_keras_fit # First Party import smdebug.tensorflow as smd from smdebug.core.collection import CollectionKeys +from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter from smdebug.profiler.profiler_config_parser import ProfilerConfigParser from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents from smdebug.tensorflow import KerasHook as Hook +from smdebug.profiler.profiler_constants import ( + CONVERT_TO_MICROSECS, + DEFAULT_PREFIX, + TRACE_DIRECTORY_FORMAT, + CPROFILE_NAME, + CPROFILE_STATS_FILENAME, + PYINSTRUMENT_HTML_FILENAME, + PYINSTRUMENT_JSON_FILENAME, + PYINSTRUMENT_NAME, +) +from smdebug.profiler.python_profiler import PythonProfiler +from smdebug.profiler.python_profiler import ( + PyinstrumentPythonProfiler, + cProfilePythonProfiler, + cProfileTimer, +) +from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase @pytest.fixture() def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @@ -22,6 +43,20 @@ def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): return ProfilerConfigParser() +@pytest.fixture() +def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def 
tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + @pytest.fixture() def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") @@ -29,9 +64,24 @@ def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): return ProfilerConfigParser() -def create_hook(trial_dir): - hook = smd.KerasHook(trial_dir, save_all=True) - return hook +@pytest.fixture +def test_framework(): + return "test-framework" + + +@pytest.fixture() +def cprofile_python_profiler(out_dir, test_framework): + return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) + + +@pytest.fixture() +def pyinstrument_python_profiler(out_dir, test_framework): + return PyinstrumentPythonProfiler(out_dir, test_framework) + + +@pytest.fixture() +def framework_dir(out_dir, test_framework): + return "{0}/framework/{1}".format(out_dir, test_framework) def create_model(): @@ -46,12 +96,10 @@ def create_model(): ) return model -def helper_native_tf2(): +def helper_native_tf2_profiler_debugger(trial_dir, hook): -def test_gradtape_tf_function(out_dir): def get_grads(images, labels): - # with tf.GradientTape() as tape: return model(images, training=True) @tf.function @@ -63,32 +111,29 @@ def train_step(images, labels): dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) ) - dataset = dataset.shuffle(1000).batch(64) + dataset = dataset.shuffle(1000).batch(128) model = create_model() - hook = create_hook(out_dir) opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: - dataset_labels = labels labels = tf.one_hot(labels, depth=10) hook.start_profiling_start_train_batch() with hook.wrap_tape(tf.GradientTape()) as tape: logits = train_step(data, labels) grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) - # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(out_dir, save_format="tf") - + model.save(trial_dir, save_format="tf") - trial = smd.create_trial(out_dir) + trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ "weights/dense/kernel:0", @@ -109,7 +154,40 @@ def train_step(images, labels): assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] -def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): +def helper_native_tf2_profiler(trial_dir, hook): + + def get_grads(images, labels): + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = 
tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(128) + model = create_model() + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with tf.GradientTape() as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): """ This test executes a TF2 native training script, enables detailed TF profiling by step, and verifies the number of events. @@ -117,8 +195,7 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config assert tf2_profiler_config_parser_by_step.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) - hook.close() + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -141,4 +218,236 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config # The number of events is varying by a small number on # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 \ No newline at end of file + assert num_trace_events >= 230 + + +def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. 
+ """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + + +# @pytest.mark.parametrize("use_pyinstrument", [False, True]) +# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) +def test_native_python_profiling_cprofiler( + out_dir, tf2_python_cprofiler_config_parser_by_step +): + assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled + + config = tf2_python_cprofiler_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. 
+ assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_native_python_profiling_pyinstrument( + out_dir, tf2_python_pyinstrument_config_parser_by_step +): + assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled + + config = tf2_python_pyinstrument_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.profiler_name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = PYINSTRUMENT_NAME + allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_create_timeline_file(simple_profiler_config_parser, out_dir): + """ + This test is meant to test successful creation of the timeline file according to file path specification. + $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ + {$ENV_NODE_ID_4digits0padded}_pythontimeline.json + It reads backs the file contents to make sure it is in valid JSON format. 
+ """ + assert simple_profiler_config_parser.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict \ No newline at end of file From d5d583522be01f28ee70ae9f61ee75b982785fd2 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 12:05:19 -0800 Subject: [PATCH 12/97] add python profiler as attr for kerashook --- smdebug/tensorflow/keras.py | 9 ++------- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 6 +++++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 22ae7b588..e1111045b 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,7 +61,6 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) -# print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -1105,9 +1104,8 @@ def unwrap(func): def close(self): self._cleanup() - # print("\nStep Number in the close function: ", self.step) + if self.python_profiler: - # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1115,6 +1113,7 @@ def close(self): ) self.debugger_native_training = False + def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1337,7 +1336,6 @@ def profiling_start_batch(self, mode): self.is_dataloader_profiling = False if self.python_profiler: - # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1346,7 +1344,6 @@ def profiling_start_batch(self, mode): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - # print("Start python profiling in start train batch") self.python_profiler.start_profiling( StepPhase.STEP_START, start_mode=mode_keys_to_python_profile_mode(mode), @@ -1399,7 +1396,6 @@ def profiling_end_batch(self, mode): ) if self.python_profiler: - # print("Stop python profiling in end train batch") self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1408,7 +1404,6 @@ def profiling_end_batch(self, mode): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - # print("Start python profiling in end train batch") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(mode), diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 3e1f08b2b..9ba8dd60b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ 
b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -173,6 +173,8 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + + # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -338,6 +340,7 @@ def test_native_python_profiling_cprofiler( print('\nnum_steps: ', config.python_profiling_config.num_steps) python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -347,6 +350,7 @@ def test_native_python_profiling_cprofiler( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. @@ -384,7 +388,7 @@ def test_native_python_profiling_pyinstrument( print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps From 2b44d54e767c9b56b2598eae7b14181b3e069fb6 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 16:01:59 -0800 Subject: [PATCH 13/97] add tests --- smdebug/tensorflow/keras.py | 2 +- .../tensorflow2/test_native_tf2_profiling.py | 34 ++++++++++++------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index e1111045b..2b8c2a9eb 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -316,7 +316,7 @@ def _create_tensors_for_matching_collections( for t in tensor_refs: self.tensor_to_collections[t.name] = colls_with_tensor elif colls_with_tensor: - # we should only readd tensors which were already added if these are variables + # we should only read tensors which were already added if these are variables # other tensors are part of a different mode, and will cause a crash if fetched # because their input placeholders will not be passed. 
if any( diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 9ba8dd60b..9a7416be3 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -5,6 +5,7 @@ from datetime import datetime from pathlib import Path import pstats +import atexit # Third Party import tensorflow as tf @@ -84,6 +85,18 @@ def framework_dir(out_dir, test_framework): return "{0}/framework/{1}".format(out_dir, test_framework) +def set_up_profiling(profilerconfig): + profiler_config_parser = profilerconfig + python_profiler = None + if profiler_config_parser.profiling_enabled: + config = profiler_config_parser.config + if config.python_profiling_config.is_enabled(): + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + python_profiler.start_profiling(StepPhase.START) + atexit.register(python_profiler.stop_profiling, StepPhase.END) + return profiler_config_parser, python_profiler + + def create_model(): model = tf.keras.models.Sequential( [ @@ -131,7 +144,7 @@ def train_step(images, labels): hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(trial_dir, save_format="tf") + # model.save(trial_dir, save_format="tf") trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] @@ -173,8 +186,6 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) - - # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -196,7 +207,7 @@ def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parse """ assert tf2_profiler_config_parser_by_step.profiling_enabled - hook = Hook(out_dir=out_dir) + hook = Hook(out_dir=out_dir, save_all=True) helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -333,13 +344,13 @@ def test_native_python_profiling_cprofiler( ): assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - config = tf2_python_cprofiler_config_parser_by_step.config + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) + + config = profiler_config_parser.config print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) print('\nname: ', config.python_profiling_config.name) print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps @@ -382,13 +393,9 @@ def test_native_python_profiling_pyinstrument( ): assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - config = tf2_python_pyinstrument_config_parser_by_step.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.profiler_name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + config = profiler_config_parser.config start_step = 
config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -398,6 +405,7 @@ def test_native_python_profiling_pyinstrument( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. From d77b3389fa591af16d339b32cfef104c7612c89d Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 14:16:24 -0800 Subject: [PATCH 14/97] update profiler for native tf training --- smdebug/tensorflow/keras.py | 24 +- ...filer_cprofiler_config_parser_by_step.json | 3 +- ...er_pyinstrument_config_parser_by_step.json | 1 - .../tensorflow2/test_native_tf2_profiling.py | 465 ------------------ 4 files changed, 16 insertions(+), 477 deletions(-) delete mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 2b8c2a9eb..72f15e9f0 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,6 +61,10 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) +<<<<<<< HEAD +======= +# print('prezero-step start profiling object outside: ', python_profiler) +>>>>>>> update profiler for native tf training class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -144,6 +148,7 @@ def _is_not_supported(self): self._hook_supported = False elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: try: +<<<<<<< HEAD if is_tf_version_greater_than_2_4_x(): # distributed_training_utils.py renamed to distributed_training_utils_v1 in tf 2.4.0 from tensorflow.python.keras.distribute.distributed_training_utils_v1 import ( @@ -154,6 +159,12 @@ def _is_not_supported(self): get_distributed_model, ) +======= + # Third Party + from tensorflow.python.keras.distribute.distributed_training_utils import ( + get_distributed_model, + ) +>>>>>>> update profiler for native tf training except ImportError: # for tf1.13 we can't import this, so we can't support mirrored strategy self.logger.info( @@ -1106,6 +1117,7 @@ def close(self): self._cleanup() if self.python_profiler: + # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1113,7 +1125,6 @@ def close(self): ) self.debugger_native_training = False - def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1152,7 +1163,6 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() - # print("\nStep number in the push tape: ", self.step) if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1243,7 +1253,6 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - # print("\nStep number in the pop tape: ", self.step) return run @@ -1320,7 +1329,7 @@ def profiling_start_batch(self, mode): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - # print("Step Number in start train batch: ", self.mode_steps[mode]) + print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() @@ -1336,6 +1345,7 @@ def profiling_start_batch(self, mode): 
self.is_dataloader_profiling = False if self.python_profiler: + # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1381,7 +1391,6 @@ def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1414,13 +1423,11 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. """ - # print("Step Number at the end of training: ", self.step) - # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): # print("Stop Dataloader profiling") self.is_dataloader_profiling = False @@ -1433,4 +1440,3 @@ def profiling_end(self): start_time_us=self.tf_profiler_start_time_in_micros, ) self.is_detailed_profiling = False - diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index f06218f77..d568b471c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,5 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} - +} \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index ad5a555f7..53ac1485e 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -5,4 +5,3 @@ "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } - diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py deleted file mode 100644 index 9a7416be3..000000000 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ /dev/null @@ -1,465 +0,0 @@ -# Standard Library -import os -import json -import time -from datetime import datetime -from pathlib import Path -import pstats -import atexit - -# Third Party -import tensorflow as tf -import pytest - -# First Party -import smdebug.tensorflow as smd -from smdebug.core.collection import CollectionKeys -from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter -from smdebug.profiler.profiler_config_parser import ProfilerConfigParser -from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX -from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents -from smdebug.tensorflow import KerasHook as Hook -from smdebug.profiler.profiler_constants import ( - CONVERT_TO_MICROSECS, - DEFAULT_PREFIX, - TRACE_DIRECTORY_FORMAT, - CPROFILE_NAME, - CPROFILE_STATS_FILENAME, - PYINSTRUMENT_HTML_FILENAME, - PYINSTRUMENT_JSON_FILENAME, - PYINSTRUMENT_NAME, -) -from smdebug.profiler.python_profiler import PythonProfiler -from smdebug.profiler.python_profiler import ( - PyinstrumentPythonProfiler, - cProfilePythonProfiler, - 
cProfileTimer, -) -from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase - -@pytest.fixture() -def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture -def test_framework(): - return "test-framework" - - -@pytest.fixture() -def cprofile_python_profiler(out_dir, test_framework): - return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) - - -@pytest.fixture() -def pyinstrument_python_profiler(out_dir, test_framework): - return PyinstrumentPythonProfiler(out_dir, test_framework) - - -@pytest.fixture() -def framework_dir(out_dir, test_framework): - return "{0}/framework/{1}".format(out_dir, test_framework) - - -def set_up_profiling(profilerconfig): - profiler_config_parser = profilerconfig - python_profiler = None - if profiler_config_parser.profiling_enabled: - config = profiler_config_parser.config - if config.python_profiling_config.is_enabled(): - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") - python_profiler.start_profiling(StepPhase.START) - atexit.register(python_profiler.stop_profiling, StepPhase.END) - return profiler_config_parser, python_profiler - - -def create_model(): - model = tf.keras.models.Sequential( - [ - # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 - tf.keras.layers.Flatten(input_shape=(28, 28, 1)), - tf.keras.layers.Dense(128, activation="relu"), - tf.keras.layers.Dropout(0.2), - tf.keras.layers.Dense(10, activation="softmax"), - ] - ) - return model - - -def helper_native_tf2_profiler_debugger(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with hook.wrap_tape(tf.GradientTape()) as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - 
hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - # model.save(trial_dir, save_format="tf") - - trial = smd.create_trial(trial_dir) - assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] - assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ - "weights/dense/kernel:0", - "weights/dense_1/kernel:0", - ] - assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ - "weights/dense/bias:0", - "weights/dense_1/bias:0", - ] - assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ - "Adam/beta_1:0", - "Adam/beta_2:0", - "Adam/decay:0", - "Adam/iter:0", - "Adam/learning_rate:0", - ] - assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] - assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] - - -def helper_native_tf2_profiler(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with tf.GradientTape() as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir, save_all=True) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - -def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. 
- """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. 
- assert num_trace_events >= 700 - - -# @pytest.mark.parametrize("use_pyinstrument", [False, True]) -# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) -def test_native_python_profiling_cprofiler( - out_dir, tf2_python_cprofiler_config_parser_by_step -): - assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) - - config = profiler_config_parser.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) - print('\ntest function: ', python_profiler) - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = CPROFILE_NAME - allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # Test that directory and corresponding files exist. - assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -def test_native_python_profiling_pyinstrument( - out_dir, tf2_python_pyinstrument_config_parser_by_step -): - assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = PYINSTRUMENT_NAME - allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # Test that directory and corresponding files exist. 
- assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -def test_create_timeline_file(simple_profiler_config_parser, out_dir): - """ - This test is meant to test successful creation of the timeline file according to file path specification. - $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ - {$ENV_NODE_ID_4digits0padded}_pythontimeline.json - It reads backs the file contents to make sure it is in valid JSON format. - """ - assert simple_profiler_config_parser.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - files = [] - for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): - files.append(path) - - assert len(files) == 1 - - file_ts = files[0].name.split("_")[0] - folder_name = files[0].parent.name - assert folder_name == time.strftime( - TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) - ) - assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( - TRACE_DIRECTORY_FORMAT - ) - - with open(files[0]) as timeline_file: - events_dict = json.load(timeline_file) - - assert events_dict \ No newline at end of file From 428cdd9af2f36e7985e1e14d6c41ed7423177b2c Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Wed, 13 Jan 2021 15:25:40 -0700 Subject: [PATCH 15/97] Modify distributed_training_utils.py import for TF 2.4 (#422) --- smdebug/tensorflow/keras.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 72f15e9f0..e9a7a9635 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,10 +61,6 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) -<<<<<<< HEAD -======= -# print('prezero-step start profiling object outside: ', python_profiler) ->>>>>>> update profiler for native tf training class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -148,7 +144,6 @@ def _is_not_supported(self): self._hook_supported = False elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: try: -<<<<<<< HEAD if is_tf_version_greater_than_2_4_x(): # distributed_training_utils.py renamed to distributed_training_utils_v1 in tf 2.4.0 from tensorflow.python.keras.distribute.distributed_training_utils_v1 import ( @@ -159,12 +154,6 @@ def _is_not_supported(self): get_distributed_model, ) -======= - # Third Party - from tensorflow.python.keras.distribute.distributed_training_utils import ( - get_distributed_model, - ) ->>>>>>> update profiler for native tf training except ImportError: # for tf1.13 we can't import this, so we can't support mirrored strategy self.logger.info( From 
f421f1d84138c93e42c0df61b4cca6085662176f Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Wed, 13 Jan 2021 16:27:58 -0700 Subject: [PATCH 16/97] Cache TF Versions (#421) --- smdebug/tensorflow/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 11d700dfc..c26247229 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -428,7 +428,7 @@ def is_tf_version_greater_than_2_4_x(): def is_tf_version_greater_than_2_4_x(): - return version.parse("2.4.0") <= version.parse(tf.__version__) + return version.parse("2.4.0") <= TF_VERSION def is_profiler_supported_for_tf_version(): From 21c4b22f47022e1693bc8eb2fb560601849d8581 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 19:48:15 -0800 Subject: [PATCH 17/97] remove print statement --- smdebug/tensorflow/keras.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index e9a7a9635..4f79ea629 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1104,7 +1104,7 @@ def unwrap(func): def close(self): self._cleanup() - + # print("\nStep Number in the close function: ", self.step) if self.python_profiler: # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( @@ -1152,6 +1152,8 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() + # print("\nStep number in the push tape: ", self.step) + if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1242,6 +1244,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step + # print("\nStep number in the pop tape: ", self.step) return run @@ -1318,7 +1321,7 @@ def profiling_start_batch(self, mode): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - print("Step Number in start train batch: ", self.mode_steps[mode]) + # print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() @@ -1380,6 +1383,7 @@ def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ + # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1412,6 +1416,7 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" + # print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() From d11c4ac51916a1c273ce4cc83a4d9e308fecc64e Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 22:02:39 -0800 Subject: [PATCH 18/97] clean up the code --- smdebug/tensorflow/keras.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 4f79ea629..8485677a9 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1152,8 +1152,6 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() - # print("\nStep number in the push tape: ", self.step) - if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1244,7 +1242,6 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - # print("\nStep number in the pop tape: ", self.step) return run @@ -1314,6 +1311,8 @@ def profiling_start_batch(self, mode): self.set_mode(mode) + # When only profiler is enabled in the native tf2 training, + # increasing the step number in the TRAIN and GLOBAL mode. if not self.debugger_native_training: self.step += 1 self.mode_steps[self.mode] += 1 @@ -1321,8 +1320,6 @@ def profiling_start_batch(self, mode): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - # print("Step Number in start train batch: ", self.mode_steps[mode]) - self.profiler_config_parser.load_config() if self.profiler_config_parser.should_save_metrics( @@ -1337,7 +1334,6 @@ def profiling_start_batch(self, mode): self.is_dataloader_profiling = False if self.python_profiler: - # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1354,7 +1350,7 @@ def profiling_start_batch(self, mode): if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] ): if not self.is_detailed_profiling: self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( @@ -1383,7 +1379,6 @@ def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1416,14 +1411,12 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" - # print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( TF_DATALOADER_END_FLAG_FILENAME ): - # print("Stop Dataloader profiling") self.is_dataloader_profiling = False if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: From 4044d3aaba69dc034cd5176f173f217eb0ecce0a Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 23:22:21 -0800 Subject: [PATCH 19/97] clean up code --- smdebug/tensorflow/keras.py | 6 +----- smdebug/tensorflow/utils.py | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 8485677a9..267d71257 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -105,7 +105,6 @@ def __init__( # Profiling vars self.tf_profiler = None if is_profiler_supported_for_tf_version(): - # Third Party from tensorflow.python.profiler import profiler_v2 as tf_profiler self.tf_profiler = tf_profiler @@ -336,7 +335,6 @@ def _create_tensors_for_matching_collections( def _get_distributed_model(self, mode): # not available in tf 1.13, code shouldn't reach here for 1.13 # because of _is_not_supported - # Third Party from tensorflow.python.keras.distribute.distributed_training_utils import ( get_distributed_model, ) @@ -1104,9 +1102,8 @@ def unwrap(func): def close(self): self._cleanup() - # print("\nStep Number in the close function: ", self.step) + if self.python_profiler: - # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1154,7 +1151,6 @@ def run(*args, **kwargs): self._increment_step() if self._get_collections_to_save_for_step(): - # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) self._initialize_writers() if self.last_saved_step is not None and self._exported_collections is False: diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index c26247229..af9a0e901 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -427,10 +427,6 @@ def is_tf_version_greater_than_2_4_x(): return version.parse("2.4.0") <= TF_VERSION -def is_tf_version_greater_than_2_4_x(): - return version.parse("2.4.0") <= TF_VERSION - - def is_profiler_supported_for_tf_version(): # Profiler Support Added For TF Versions 2.2.0 And Greater return version.parse("2.2.0") <= TF_VERSION From 14f0275fc496c7f4bfe1821b2b2a0fb8feba8062 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 23:56:45 -0800 Subject: [PATCH 20/97] update format --- ...est_tf2_python_profiler_cprofiler_config_parser_by_step.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index d568b471c..e51c386c2 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} \ No newline at end of file +} From 794126f69b7fd9bae8fab8211934bc1076219326 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 
2021 15:32:23 -0800 Subject: [PATCH 21/97] add changes for enabling profiling for native tf2 training --- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py new file mode 100644 index 000000000..e69de29bb From 4d1ddc0b7bc446a6cf90c3fd0ade967a1c603f24 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:42:55 -0800 Subject: [PATCH 22/97] add changes for enabling profiling in the native tf2 training --- smdebug/tensorflow/keras.py | 169 +++++++++++++++++- .../tensorflow2/test_native_tf2_profiling.py | 144 +++++++++++++++ 2 files changed, 305 insertions(+), 8 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 49a87419e..1c0ee5805 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -121,6 +121,9 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False + # this flag is used to handle step number increment in the tensorflow native training + # it indicated to profiling for tensorflow2 native training + self.profiling_native_training = False if python_profiler: atexit.register(python_profiler.stop_profiling, StepPhase.END) @@ -1099,7 +1102,9 @@ def unwrap(func): def close(self): self._cleanup() + print('\nStep Number in the close function: ', self.step) if python_profiler: + print('python profiling for end of last train step to end of training') python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1143,7 +1148,9 @@ def run(*args, **kwargs): self._prepare_collections() self.prepared_collections = True - self._increment_step() + if not self.profiling_native_training: + self._increment_step() + print('\nStep number in the push tape: ', self.step) if self._get_collections_to_save_for_step(): self._initialize_writers() @@ -1233,6 +1240,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step + print('\nStep number in the pop tape: ', self.step) return run @@ -1259,13 +1267,13 @@ def wrap_tape(self, tape): :return: Wrapped tape of same type as passed. This tape should be used for training """ - # Disable python profiling, because now we are starting wrap tape. - if python_profiler: - python_profiler.stop_profiling( - StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(self.mode), - end_step=0, - ) + # # Disable python profiling, because now we are starting wrap tape. 
+ # if python_profiler: + # python_profiler.stop_profiling( + # StepPhase.STEP_START, + # end_mode=mode_keys_to_python_profile_mode(self.mode), + # end_step=0, + # ) from tensorflow.python.eager.backprop import GradientTape @@ -1295,3 +1303,148 @@ def record_tensor_value(self, tensor_name, tensor_value): if self._is_collection_being_saved_for_step(CollectionKeys.METRICS): self._initialize_writers(only_initialize_if_missing=True) self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) + + def start_profiling_start_train_batch(self): + print('Start profiling train batch') + + self.start = time.time() + # santiy check + if self._is_not_supported(): + return + # set mode to TRAIN + self.set_mode(ModeKeys.TRAIN) + + self.profiling_native_training = True + if self.profiling_native_training: + self._increment_step() + # if self.step_incremented_in_on_train_begin is False: + # self._increment_step() + # else: + # self.step_incremented_in_on_train_begin = False + + print('\nStep Number in start train batch: ', self.mode_steps[ModeKeys.TRAIN]) + + # load the profiler config + self.profiler_config_parser.load_config() + + + # Dataloader + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DATALOADER_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ) and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_START_FLAG_FILENAME + ): + print('Dataloader profiling') + self.is_dataloader_profiling = True + elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_END_FLAG_FILENAME + ): + self.is_dataloader_profiling = False + + # python profiling + if python_profiler: + print('Stop python profiling in start train batch') + python_profiler.stop_profiling( + StepPhase.STEP_START, + end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + end_step=self.mode_steps[ModeKeys.TRAIN], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ): + print('Start python profiling in start train batch') + python_profiler.start_profiling( + StepPhase.STEP_START, + start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + start_step=self.mode_steps[ModeKeys.TRAIN], + ) + + # detail profiling + if is_profiler_supported_for_tf_version(): + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DETAILED_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ): + if not self.is_detailed_profiling: + self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( + self.profiler_config_parser.config.local_path, + "tensorflow", + self.mode_steps[ModeKeys.TRAIN], + ) + self.logger.info( + f"Enabling TF profiler on step: = {self.mode_steps[ModeKeys.TRAIN]}" + ) + if not self.warm_up_completed: + # warming up profiler before it will be profiling. 
+ self.tf_profiler.warmup() + self.warm_up_completed = True + self.tf_profiler.start(self._log_dir) + self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS + self.is_detailed_profiling = True + elif self.is_detailed_profiling: + self.logger.info( + f"Disabling TF profiler on step: ={self.mode_steps[ModeKeys.TRAIN]}" + ) + stop_tf_profiler( + tf_profiler=self.tf_profiler, + log_dir=self._log_dir, + start_time_us=self.tf_profiler_start_time_in_micros, + ) + self.is_detailed_profiling = False + + def start_profiling_end_train_batch(self): + print('End profiling train batch') + print('\nStep Number in end train batch: ', self.mode_steps[ModeKeys.TRAIN]) + # sanity check + if self._is_not_supported(): + return + + self.record_trace_events( + training_phase="Step:" + str(ModeKeys.TRAIN), + op_name="Step:" + str(ModeKeys.TRAIN), + phase="X", + timestamp=self.start, # this is start time for step + duration=time.time() - self.start, + pid=os.getpid(), + step_num=str(self.mode_steps[ModeKeys.TRAIN]), + ) + + if python_profiler: + print('Stop python profiling in end train batch') + python_profiler.stop_profiling( + StepPhase.STEP_END, + end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + end_step=self.mode_steps[ModeKeys.TRAIN], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ): + print('Start python profiling in end train batch') + python_profiler.start_profiling( + StepPhase.STEP_END, + start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + start_step=self.mode_steps[ModeKeys.TRAIN], + ) + + def stop_profiling_end_of_training(self): + print('\nEnd of training!') + print('\nStep Number at the end of training: ', self.step) + + # Alternatively, use self.close to close the python profiling + self.close() + + if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_END_FLAG_FILENAME + ): + print('Stop Dataloader profiling') + self.is_dataloader_profiling = False + + if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: + self.logger.info("Disabling profiler, reached end of training.") + stop_tf_profiler( + tf_profiler=self.tf_profiler, + log_dir=self._log_dir, + start_time_us=self.tf_profiler_start_time_in_micros, + ) + self.is_detailed_profiling = False + + self.profiling_native_training = False diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index e69de29bb..16b9e121a 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -0,0 +1,144 @@ +# Standard Library +import os +from pathlib import Path + +# Third Party +import tensorflow as tf +import pytest +# from tests.tensorflow2.test_keras import helper_keras_fit + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.profiler.profiler_config_parser import ProfilerConfigParser +from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX +from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents +from smdebug.tensorflow import KerasHook as Hook + +@pytest.fixture() +def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + 
+@pytest.fixture() +def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +def create_hook(trial_dir): + hook = smd.KerasHook(trial_dir, save_all=True) + return hook + + +def create_model(): + model = tf.keras.models.Sequential( + [ + # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 + tf.keras.layers.Flatten(input_shape=(28, 28, 1)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) + return model + +def helper_native_tf2(): + + +def test_gradtape_tf_function(out_dir): + def get_grads(images, labels): + # with tf.GradientTape() as tape: + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(64) + model = create_model() + hook = create_hook(out_dir) + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + dataset_labels = labels + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with hook.wrap_tape(tf.GradientTape()) as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + model.save(out_dir, save_format="tf") + + + trial = smd.create_trial(out_dir) + assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ + "weights/dense/kernel:0", + "weights/dense_1/kernel:0", + ] + assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ + "weights/dense/bias:0", + "weights/dense_1/bias:0", + ] + assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] + + +def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. 
+ """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) + hook.close() + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 \ No newline at end of file From 64ce546ffe369fbfe985accd099dee1b4265fd01 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 11 Jan 2021 11:08:36 -0800 Subject: [PATCH 23/97] add tests --- smdebug/tensorflow/keras.py | 31 +- ...filer_cprofiler_config_parser_by_step.json | 7 + ...er_pyinstrument_config_parser_by_step.json | 7 + .../tensorflow2/test_native_tf2_profiling.py | 349 +++++++++++++++++- 4 files changed, 358 insertions(+), 36 deletions(-) create mode 100644 tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json create mode 100644 tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 1c0ee5805..a87d3a903 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -121,8 +121,8 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag is used to handle step number increment in the tensorflow native training - # it indicated to profiling for tensorflow2 native training + # this flag is used to handle step number increment in the tensorflow native training when profiler is on + # it indicates to profiling for tensorflow2 native training self.profiling_native_training = False if python_profiler: @@ -1044,7 +1044,7 @@ def wrap_optimizer(self, optimizer): if isinstance(optimizer, tf.train.Optimizer): optimizer = self._wrap_apply_gradients(optimizer) elif isinstance(optimizer, tf.keras.optimizers.Optimizer) or is_keras_optimizer(optimizer): - # either subclasse of optimizerV2 class in tf.keras + # either subclass of optimizerV2 class in tf.keras # or keras.optimizers.Optimizer original_get_grads = optimizer.__class__.get_gradients @@ -1305,30 +1305,25 @@ def record_tensor_value(self, tensor_name, tensor_value): self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) def start_profiling_start_train_batch(self): + """ + Enabling profiler at the start of train batch when native tf2 training is used. 
+ """ print('Start profiling train batch') self.start = time.time() - # santiy check + if self._is_not_supported(): return - # set mode to TRAIN self.set_mode(ModeKeys.TRAIN) self.profiling_native_training = True if self.profiling_native_training: self._increment_step() - # if self.step_incremented_in_on_train_begin is False: - # self._increment_step() - # else: - # self.step_incremented_in_on_train_begin = False print('\nStep Number in start train batch: ', self.mode_steps[ModeKeys.TRAIN]) - # load the profiler config self.profiler_config_parser.load_config() - - # Dataloader if self.profiler_config_parser.should_save_metrics( MetricsCategory.DATALOADER_PROFILING, self.mode_steps[ModeKeys.TRAIN] ) and self.profiler_config_parser.write_tf_dataloader_flag( @@ -1341,7 +1336,6 @@ def start_profiling_start_train_batch(self): ): self.is_dataloader_profiling = False - # python profiling if python_profiler: print('Stop python profiling in start train batch') python_profiler.stop_profiling( @@ -1359,7 +1353,6 @@ def start_profiling_start_train_batch(self): start_step=self.mode_steps[ModeKeys.TRAIN], ) - # detail profiling if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( MetricsCategory.DETAILED_PROFILING, self.mode_steps[ModeKeys.TRAIN] @@ -1392,9 +1385,12 @@ def start_profiling_start_train_batch(self): self.is_detailed_profiling = False def start_profiling_end_train_batch(self): + """ + Enabling profiler at the end of train batch when native tf2 training is used. + """ print('End profiling train batch') print('\nStep Number in end train batch: ', self.mode_steps[ModeKeys.TRAIN]) - # sanity check + if self._is_not_supported(): return @@ -1426,10 +1422,13 @@ def start_profiling_end_train_batch(self): ) def stop_profiling_end_of_training(self): + """ + Stop profiler at the end of training when native tf2 training is used. 
+ """ print('\nEnd of training!') print('\nStep Number at the end of training: ', self.step) - # Alternatively, use self.close to close the python profiling + # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json new file mode 100644 index 000000000..2ab039217 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -0,0 +1,7 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2}" + } +} \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json new file mode 100644 index 000000000..325224801 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -0,0 +1,7 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"python_profiler\": \"Pyinstrument\"}" + } +} \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 16b9e121a..3e1f08b2b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -1,19 +1,40 @@ # Standard Library import os +import json +import time +from datetime import datetime from pathlib import Path +import pstats # Third Party import tensorflow as tf import pytest -# from tests.tensorflow2.test_keras import helper_keras_fit # First Party import smdebug.tensorflow as smd from smdebug.core.collection import CollectionKeys +from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter from smdebug.profiler.profiler_config_parser import ProfilerConfigParser from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents from smdebug.tensorflow import KerasHook as Hook +from smdebug.profiler.profiler_constants import ( + CONVERT_TO_MICROSECS, + DEFAULT_PREFIX, + TRACE_DIRECTORY_FORMAT, + CPROFILE_NAME, + CPROFILE_STATS_FILENAME, + PYINSTRUMENT_HTML_FILENAME, + PYINSTRUMENT_JSON_FILENAME, + PYINSTRUMENT_NAME, +) +from smdebug.profiler.python_profiler import PythonProfiler +from smdebug.profiler.python_profiler import ( + PyinstrumentPythonProfiler, + cProfilePythonProfiler, + cProfileTimer, +) +from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase @pytest.fixture() def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @@ -22,6 +43,20 @@ def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): return ProfilerConfigParser() +@pytest.fixture() +def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_python_pyinstrument_config_parser_by_step(config_folder, 
monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + @pytest.fixture() def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") @@ -29,9 +64,24 @@ def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): return ProfilerConfigParser() -def create_hook(trial_dir): - hook = smd.KerasHook(trial_dir, save_all=True) - return hook +@pytest.fixture +def test_framework(): + return "test-framework" + + +@pytest.fixture() +def cprofile_python_profiler(out_dir, test_framework): + return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) + + +@pytest.fixture() +def pyinstrument_python_profiler(out_dir, test_framework): + return PyinstrumentPythonProfiler(out_dir, test_framework) + + +@pytest.fixture() +def framework_dir(out_dir, test_framework): + return "{0}/framework/{1}".format(out_dir, test_framework) def create_model(): @@ -46,12 +96,10 @@ def create_model(): ) return model -def helper_native_tf2(): +def helper_native_tf2_profiler_debugger(trial_dir, hook): -def test_gradtape_tf_function(out_dir): def get_grads(images, labels): - # with tf.GradientTape() as tape: return model(images, training=True) @tf.function @@ -63,32 +111,29 @@ def train_step(images, labels): dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) ) - dataset = dataset.shuffle(1000).batch(64) + dataset = dataset.shuffle(1000).batch(128) model = create_model() - hook = create_hook(out_dir) opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: - dataset_labels = labels labels = tf.one_hot(labels, depth=10) hook.start_profiling_start_train_batch() with hook.wrap_tape(tf.GradientTape()) as tape: logits = train_step(data, labels) grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) - # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(out_dir, save_format="tf") - + model.save(trial_dir, save_format="tf") - trial = smd.create_trial(out_dir) + trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ "weights/dense/kernel:0", @@ -109,7 +154,40 @@ def train_step(images, labels): assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] -def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): +def helper_native_tf2_profiler(trial_dir, hook): + + def get_grads(images, labels): + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., 
tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(128) + model = create_model() + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with tf.GradientTape() as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): """ This test executes a TF2 native training script, enables detailed TF profiling by step, and verifies the number of events. @@ -117,8 +195,7 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config assert tf2_profiler_config_parser_by_step.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) - hook.close() + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -141,4 +218,236 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config # The number of events is varying by a small number on # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 \ No newline at end of file + assert num_trace_events >= 230 + + +def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. 
+ """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + + +# @pytest.mark.parametrize("use_pyinstrument", [False, True]) +# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) +def test_native_python_profiling_cprofiler( + out_dir, tf2_python_cprofiler_config_parser_by_step +): + assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled + + config = tf2_python_cprofiler_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. 
+ assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_native_python_profiling_pyinstrument( + out_dir, tf2_python_pyinstrument_config_parser_by_step +): + assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled + + config = tf2_python_pyinstrument_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.profiler_name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = PYINSTRUMENT_NAME + allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_create_timeline_file(simple_profiler_config_parser, out_dir): + """ + This test is meant to test successful creation of the timeline file according to file path specification. + $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ + {$ENV_NODE_ID_4digits0padded}_pythontimeline.json + It reads backs the file contents to make sure it is in valid JSON format. 
+ """ + assert simple_profiler_config_parser.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict \ No newline at end of file From 7e9c830627cb4317da9594917eceb737c90a9716 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 12:05:19 -0800 Subject: [PATCH 24/97] add python profiler as attr for kerashook --- smdebug/tensorflow/keras.py | 57 ++++++++++++------- ...er_pyinstrument_config_parser_by_step.json | 2 +- .../tensorflow2/test_native_tf2_profiling.py | 6 +- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index a87d3a903..009efe230 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,6 +61,7 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) +print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -125,7 +126,11 @@ def __init__( # it indicates to profiling for tensorflow2 native training self.profiling_native_training = False - if python_profiler: + self.python_profiler = python_profiler + print('\nObject inside: ', self.python_profiler) + + if self.python_profiler: + print('exit stop profiling object inside: ', self.python_profiler, self.step) atexit.register(python_profiler.stop_profiling, StepPhase.END) def _is_not_supported(self): @@ -753,13 +758,13 @@ def on_test_begin(self, logs=None): self._on_any_mode_begin(ModeKeys.EVAL) def _on_any_mode_end(self, mode): - if python_profiler: - python_profiler.stop_profiling( + if self.python_profiler: + self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(mode), end_step=self.mode_steps[mode], ) - python_profiler.start_profiling( + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(mode), start_step=self.mode_steps[mode], @@ -837,8 +842,8 @@ def _on_any_batch_begin(self, batch, mode, logs=None): ): self.is_dataloader_profiling = False - if python_profiler: - python_profiler.stop_profiling( + if self.python_profiler: + self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), end_step=self.mode_steps[mode], @@ -846,7 +851,7 @@ def _on_any_batch_begin(self, batch, mode, logs=None): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - python_profiler.start_profiling( + self.python_profiler.start_profiling( StepPhase.STEP_START, start_mode=mode_keys_to_python_profile_mode(mode), start_step=self.mode_steps[mode], @@ -1007,8 +1012,8 @@ def _on_any_batch_end(self, batch, mode, logs=None): self._export_model() self._exported_model[self.mode] = True - if python_profiler: - python_profiler.stop_profiling( + if self.python_profiler: 
+ self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(mode), end_step=self.mode_steps[mode], @@ -1016,7 +1021,7 @@ def _on_any_batch_end(self, batch, mode, logs=None): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - python_profiler.start_profiling( + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(mode), start_step=self.mode_steps[mode], @@ -1103,14 +1108,16 @@ def unwrap(func): def close(self): self._cleanup() print('\nStep Number in the close function: ', self.step) - if python_profiler: + if self.python_profiler: print('python profiling for end of last train step to end of training') - python_profiler.start_profiling( + print('close start profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) + def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1308,7 +1315,15 @@ def start_profiling_start_train_batch(self): """ Enabling profiler at the start of train batch when native tf2 training is used. """ - print('Start profiling train batch') + + # print('Start profiling train batch') + # a = PythonProfiler.get_python_profiler(profiler_config_parser.config, 'tensorflow') + # print('\nProfiler enabled: ', a) + # + # print('\nname: ', self.profiler_config_parser.config.python_profiling_config.name) + # print('\npython profiling: ', self.python_profiler) + # print('\nstart_step: ', self.profiler_config_parser.config.python_profiling_config.start_step) + # print('\nnum_steps: ', self.profiler_config_parser.config.python_profiling_config.num_steps) self.start = time.time() @@ -1336,9 +1351,10 @@ def start_profiling_start_train_batch(self): ): self.is_dataloader_profiling = False - if python_profiler: + if self.python_profiler: print('Stop python profiling in start train batch') - python_profiler.stop_profiling( + print('start train batch stop profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), end_step=self.mode_steps[ModeKeys.TRAIN], @@ -1347,7 +1363,8 @@ def start_profiling_start_train_batch(self): MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] ): print('Start python profiling in start train batch') - python_profiler.start_profiling( + print('start train batch start profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.start_profiling( StepPhase.STEP_START, start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), start_step=self.mode_steps[ModeKeys.TRAIN], @@ -1404,9 +1421,10 @@ def start_profiling_end_train_batch(self): step_num=str(self.mode_steps[ModeKeys.TRAIN]), ) - if python_profiler: + if self.python_profiler: print('Stop python profiling in end train batch') - python_profiler.stop_profiling( + print('end train batch stop profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), end_step=self.mode_steps[ModeKeys.TRAIN], @@ -1415,7 +1433,8 @@ def start_profiling_end_train_batch(self): MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] ): print('Start python profiling in end train batch') - python_profiler.start_profiling( + print('end train 
batch start profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), start_step=self.mode_steps[ModeKeys.TRAIN], diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 325224801..02bc8c0d3 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"python_profiler\": \"Pyinstrument\"}" + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" } } \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 3e1f08b2b..9ba8dd60b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -173,6 +173,8 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + + # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -338,6 +340,7 @@ def test_native_python_profiling_cprofiler( print('\nnum_steps: ', config.python_profiling_config.num_steps) python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -347,6 +350,7 @@ def test_native_python_profiling_cprofiler( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. 
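    # Why hook.python_profiler is overridden above: the module-level python_profiler in
    # smdebug.tensorflow.keras is created from whatever profiler config is active when that
    # module is first imported, so the test builds its own PythonProfiler from the
    # monkeypatched config and attaches it to the hook, ensuring profiling follows this
    # test's StartStep/NumSteps settings.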
@@ -384,7 +388,7 @@ def test_native_python_profiling_pyinstrument( print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps From 5e2172a592329c3dce2cff131a2f878f70a93408 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 16:01:59 -0800 Subject: [PATCH 25/97] add tests --- smdebug/tensorflow/keras.py | 2 +- ...filer_cprofiler_config_parser_by_step.json | 2 +- ...er_pyinstrument_config_parser_by_step.json | 2 +- .../tensorflow2/test_native_tf2_profiling.py | 34 ++++++++++++------- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 009efe230..d31017179 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -131,7 +131,7 @@ def __init__( if self.python_profiler: print('exit stop profiling object inside: ', self.python_profiler, self.step) - atexit.register(python_profiler.stop_profiling, StepPhase.END) + atexit.register(self.python_profiler.stop_profiling, StepPhase.END) def _is_not_supported(self): if self.distribution_strategy is None: diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index 2ab039217..d568b471c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2}" + "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } } \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 02bc8c0d3..c1c45594c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" + "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 9ba8dd60b..9a7416be3 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -5,6 +5,7 @@ from datetime import datetime from pathlib import Path import pstats +import atexit # Third Party import tensorflow as tf @@ -84,6 +85,18 @@ def framework_dir(out_dir, test_framework): return "{0}/framework/{1}".format(out_dir, test_framework) +def set_up_profiling(profilerconfig): + profiler_config_parser = profilerconfig + python_profiler = None + if profiler_config_parser.profiling_enabled: + config = 
profiler_config_parser.config + if config.python_profiling_config.is_enabled(): + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + python_profiler.start_profiling(StepPhase.START) + atexit.register(python_profiler.stop_profiling, StepPhase.END) + return profiler_config_parser, python_profiler + + def create_model(): model = tf.keras.models.Sequential( [ @@ -131,7 +144,7 @@ def train_step(images, labels): hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(trial_dir, save_format="tf") + # model.save(trial_dir, save_format="tf") trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] @@ -173,8 +186,6 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) - - # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -196,7 +207,7 @@ def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parse """ assert tf2_profiler_config_parser_by_step.profiling_enabled - hook = Hook(out_dir=out_dir) + hook = Hook(out_dir=out_dir, save_all=True) helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -333,13 +344,13 @@ def test_native_python_profiling_cprofiler( ): assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - config = tf2_python_cprofiler_config_parser_by_step.config + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) + + config = profiler_config_parser.config print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) print('\nname: ', config.python_profiling_config.name) print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps @@ -382,13 +393,9 @@ def test_native_python_profiling_pyinstrument( ): assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - config = tf2_python_pyinstrument_config_parser_by_step.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.profiler_name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + config = profiler_config_parser.config start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -398,6 +405,7 @@ def test_native_python_profiling_pyinstrument( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. 
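The patches above expose per-batch profiling entry points on KerasHook for TF2 training loops that bypass model.fit(). The sketch below shows the intended call pattern at this point in the series, using the method names introduced so far (the following patch renames them to profiling_start_batch, profiling_end_batch and profiling_end). It assumes the patched smdebug is installed and that SMPROFILER_CONFIG_PATH points at a profiler config like the JSON files added above; the output path, toy model and random dataset are illustrative stand-ins for the MNIST setup used in the tests.

    import numpy as np
    import tensorflow as tf

    import smdebug.tensorflow as smd

    hook = smd.KerasHook(out_dir="/tmp/smdebug_outputs")  # illustrative output path

    model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
    opt = tf.keras.optimizers.Adam()
    hook.wrap_optimizer(opt)

    dataset = tf.data.Dataset.from_tensor_slices(
        (np.random.rand(256, 4).astype("float32"), np.random.randint(0, 10, size=256))
    ).batch(64)

    for data, labels in dataset:
        labels = tf.one_hot(labels, depth=10)
        hook.start_profiling_start_train_batch()         # start-of-step profiling hooks
        with hook.wrap_tape(tf.GradientTape()) as tape:  # wrap_tape also enables debugger tensor capture
            logits = model(data, training=True)
            loss = tf.reduce_mean(
                tf.keras.losses.categorical_crossentropy(labels, logits, from_logits=True)
            )
        grads = tape.gradient(loss, model.variables)
        opt.apply_gradients(zip(grads, model.variables))
        hook.start_profiling_end_train_batch()           # end-of-step profiling hooks
    hook.stop_profiling_end_of_training()                # closes the hook and flushes profiler state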
From d45c586e5985a3c379c92a6d03c645e242c52b8d Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 14:16:24 -0800 Subject: [PATCH 26/97] update profiler for native tf training --- smdebug/tensorflow/keras.py | 138 +++--- ...iler_all_params_config_parser_by_step.json | 8 + ...filer_cprofiler_config_parser_by_step.json | 2 +- ...er_pyinstrument_config_parser_by_step.json | 2 +- ...ofiling.py => test_native_tf2_profiler.py} | 397 +++++++++--------- 5 files changed, 271 insertions(+), 276 deletions(-) create mode 100644 tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json rename tests/profiler/tensorflow2/{test_native_tf2_profiling.py => test_native_tf2_profiler.py} (57%) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index d31017179..dbfc8bf1d 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,7 +61,7 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) -print('prezero-step start profiling object outside: ', python_profiler) +# print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -106,6 +106,7 @@ def __init__( # Profiling vars self.tf_profiler = None if is_profiler_supported_for_tf_version(): + # Third Party from tensorflow.python.profiler import profiler_v2 as tf_profiler self.tf_profiler = tf_profiler @@ -114,6 +115,7 @@ def __init__( self.is_dataloader_profiling = False self.tf_profiler_start_time_in_micros = 0 self.warm_up_completed = False + self.python_profiler = python_profiler # supports_tf_logs property was introduced in TF 2.3.0 # it indicates to the framework that the callback is not # limited to reading only numpy logs @@ -122,15 +124,10 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag is used to handle step number increment in the tensorflow native training when profiler is on - # it indicates to profiling for tensorflow2 native training - self.profiling_native_training = False - - self.python_profiler = python_profiler - print('\nObject inside: ', self.python_profiler) + # this flag indicates to debugging for tensorflow2 native training + self.debugger_native_training = False if self.python_profiler: - print('exit stop profiling object inside: ', self.python_profiler, self.step) atexit.register(self.python_profiler.stop_profiling, StepPhase.END) def _is_not_supported(self): @@ -340,6 +337,7 @@ def _create_tensors_for_matching_collections( def _get_distributed_model(self, mode): # not available in tf 1.13, code shouldn't reach here for 1.13 # because of _is_not_supported + # Third Party from tensorflow.python.keras.distribute.distributed_training_utils import ( get_distributed_model, ) @@ -1107,16 +1105,15 @@ def unwrap(func): def close(self): self._cleanup() - print('\nStep Number in the close function: ', self.step) + print("\nStep Number in the close function: ", self.step) if self.python_profiler: - print('python profiling for end of last train step to end of training') - print('close start profiling object inside: ', self.python_profiler, self.step) + # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, 
start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) - + self.debugger_native_training = False def _cleanup(self): # Unwrap the tape before closing @@ -1155,11 +1152,11 @@ def run(*args, **kwargs): self._prepare_collections() self.prepared_collections = True - if not self.profiling_native_training: - self._increment_step() - print('\nStep number in the push tape: ', self.step) + self._increment_step() + print("\nStep number in the push tape: ", self.step) if self._get_collections_to_save_for_step(): + # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) self._initialize_writers() if self.last_saved_step is not None and self._exported_collections is False: @@ -1247,7 +1244,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - print('\nStep number in the pop tape: ', self.step) + print("\nStep number in the pop tape: ", self.step) return run @@ -1274,16 +1271,12 @@ def wrap_tape(self, tape): :return: Wrapped tape of same type as passed. This tape should be used for training """ - # # Disable python profiling, because now we are starting wrap tape. - # if python_profiler: - # python_profiler.stop_profiling( - # StepPhase.STEP_START, - # end_mode=mode_keys_to_python_profile_mode(self.mode), - # end_step=0, - # ) - + # Third Party from tensorflow.python.eager.backprop import GradientTape + self.debugger_native_training = True + self.set_mode(ModeKeys.TRAIN) + if isinstance(tape, GradientTape): # unwrap tape before wrapping new tape to avoid recursive wrap tapes if self.tape: @@ -1311,78 +1304,67 @@ def record_tensor_value(self, tensor_name, tensor_value): self._initialize_writers(only_initialize_if_missing=True) self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) - def start_profiling_start_train_batch(self): + def profiling_start_batch(self, mode): """ Enabling profiler at the start of train batch when native tf2 training is used. 
""" - - # print('Start profiling train batch') - # a = PythonProfiler.get_python_profiler(profiler_config_parser.config, 'tensorflow') - # print('\nProfiler enabled: ', a) - # - # print('\nname: ', self.profiler_config_parser.config.python_profiling_config.name) - # print('\npython profiling: ', self.python_profiler) - # print('\nstart_step: ', self.profiler_config_parser.config.python_profiling_config.start_step) - # print('\nnum_steps: ', self.profiler_config_parser.config.python_profiling_config.num_steps) - self.start = time.time() if self._is_not_supported(): return - self.set_mode(ModeKeys.TRAIN) - self.profiling_native_training = True - if self.profiling_native_training: - self._increment_step() + self.set_mode(mode) - print('\nStep Number in start train batch: ', self.mode_steps[ModeKeys.TRAIN]) + if not self.debugger_native_training: + self.step += 1 + self.mode_steps[self.mode] += 1 + # Increment Global step number irrespective of what mode it is + if self.mode != ModeKeys.GLOBAL: + self.mode_steps[ModeKeys.GLOBAL] = self.step + + print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DATALOADER_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] ) and self.profiler_config_parser.write_tf_dataloader_flag( TF_DATALOADER_START_FLAG_FILENAME ): - print('Dataloader profiling') self.is_dataloader_profiling = True elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): self.is_dataloader_profiling = False if self.python_profiler: - print('Stop python profiling in start train batch') - print('start train batch stop profiling object inside: ', self.python_profiler, self.step) + # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - end_step=self.mode_steps[ModeKeys.TRAIN], + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], ) if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - print('Start python profiling in start train batch') - print('start train batch start profiling object inside: ', self.python_profiler, self.step) + # print("Start python profiling in start train batch") self.python_profiler.start_profiling( StepPhase.STEP_START, - start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - start_step=self.mode_steps[ModeKeys.TRAIN], + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], ) if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] ): if not self.is_detailed_profiling: self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( self.profiler_config_parser.config.local_path, "tensorflow", - self.mode_steps[ModeKeys.TRAIN], - ) - self.logger.info( - f"Enabling TF profiler on step: = {self.mode_steps[ModeKeys.TRAIN]}" + self.mode_steps[mode], ) + self.logger.info(f"Enabling TF profiler on step: = {self.mode_steps[mode]}") if not self.warm_up_completed: # warming up profiler before it will be profiling. 
self.tf_profiler.warmup() @@ -1391,9 +1373,7 @@ def start_profiling_start_train_batch(self): self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS self.is_detailed_profiling = True elif self.is_detailed_profiling: - self.logger.info( - f"Disabling TF profiler on step: ={self.mode_steps[ModeKeys.TRAIN]}" - ) + self.logger.info(f"Disabling TF profiler on step: ={self.mode_steps[mode]}") stop_tf_profiler( tf_profiler=self.tf_profiler, log_dir=self._log_dir, @@ -1401,59 +1381,55 @@ def start_profiling_start_train_batch(self): ) self.is_detailed_profiling = False - def start_profiling_end_train_batch(self): + def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - print('End profiling train batch') - print('\nStep Number in end train batch: ', self.mode_steps[ModeKeys.TRAIN]) + print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return self.record_trace_events( - training_phase="Step:" + str(ModeKeys.TRAIN), - op_name="Step:" + str(ModeKeys.TRAIN), + training_phase="Step:" + str(mode), + op_name="Step:" + str(mode), phase="X", timestamp=self.start, # this is start time for step duration=time.time() - self.start, pid=os.getpid(), - step_num=str(self.mode_steps[ModeKeys.TRAIN]), + step_num=str(self.mode_steps[mode]), ) if self.python_profiler: - print('Stop python profiling in end train batch') - print('end train batch stop profiling object inside: ', self.python_profiler, self.step) + # print("Stop python profiling in end train batch") self.python_profiler.stop_profiling( StepPhase.STEP_END, - end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - end_step=self.mode_steps[ModeKeys.TRAIN], + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], ) if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - print('Start python profiling in end train batch') - print('end train batch start profiling object inside: ', self.python_profiler, self.step) + # print("Start python profiling in end train batch") self.python_profiler.start_profiling( StepPhase.STEP_END, - start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - start_step=self.mode_steps[ModeKeys.TRAIN], + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], ) - def stop_profiling_end_of_training(self): + def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" - print('\nEnd of training!') - print('\nStep Number at the end of training: ', self.step) + print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): - print('Stop Dataloader profiling') + # print("Stop Dataloader profiling") self.is_dataloader_profiling = False if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: @@ -1464,5 +1440,3 @@ def stop_profiling_end_of_training(self): start_time_us=self.tf_profiler_start_time_in_micros, ) self.is_detailed_profiling = False - - self.profiling_native_training = False diff --git a/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json new file mode 100644 index 000000000..c119eebf8 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json @@ -0,0 +1,8 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "DetailedProfilingConfig": "{\"StartStep\": 2, \"NumSteps\": 2}", + "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" + } +} diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index d568b471c..e51c386c2 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} \ No newline at end of file +} diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index c1c45594c..53ac1485e 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } -} \ No newline at end of file +} diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py similarity index 57% rename from tests/profiler/tensorflow2/test_native_tf2_profiling.py rename to tests/profiler/tensorflow2/test_native_tf2_profiler.py index 9a7416be3..be2b6e124 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -1,41 +1,36 @@ # Standard Library -import os +import atexit import json +import os +import pstats import time from datetime import datetime from pathlib import Path -import pstats -import atexit # Third Party -import tensorflow as tf import pytest +import tensorflow as tf # First Party import smdebug.tensorflow as smd from smdebug.core.collection import CollectionKeys -from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter from smdebug.profiler.profiler_config_parser import ProfilerConfigParser -from smdebug.profiler.profiler_constants import 
TENSORBOARDTIMELINE_SUFFIX -from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents -from smdebug.tensorflow import KerasHook as Hook from smdebug.profiler.profiler_constants import ( CONVERT_TO_MICROSECS, - DEFAULT_PREFIX, - TRACE_DIRECTORY_FORMAT, CPROFILE_NAME, CPROFILE_STATS_FILENAME, + DEFAULT_PREFIX, PYINSTRUMENT_HTML_FILENAME, PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_NAME, + TENSORBOARDTIMELINE_SUFFIX, + TRACE_DIRECTORY_FORMAT, ) +from smdebug.profiler.python_profile_utils import StepPhase from smdebug.profiler.python_profiler import PythonProfiler -from smdebug.profiler.python_profiler import ( - PyinstrumentPythonProfiler, - cProfilePythonProfiler, - cProfileTimer, -) -from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase +from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents +from smdebug.tensorflow import KerasHook as Hook + @pytest.fixture() def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @@ -46,14 +41,18 @@ def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @pytest.fixture() def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json" + ) monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) return ProfilerConfigParser() @pytest.fixture() def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json" + ) monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) return ProfilerConfigParser() @@ -65,24 +64,13 @@ def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): return ProfilerConfigParser() -@pytest.fixture -def test_framework(): - return "test-framework" - - @pytest.fixture() -def cprofile_python_profiler(out_dir, test_framework): - return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) - - -@pytest.fixture() -def pyinstrument_python_profiler(out_dir, test_framework): - return PyinstrumentPythonProfiler(out_dir, test_framework) - - -@pytest.fixture() -def framework_dir(out_dir, test_framework): - return "{0}/framework/{1}".format(out_dir, test_framework) +def tf2_profiler_config_parser_by_step_all_params(config_folder, monkeypatch): + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_all_params_config_parser_by_step.json" + ) + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() def set_up_profiling(profilerconfig): @@ -110,65 +98,19 @@ def create_model(): return model -def helper_native_tf2_profiler_debugger(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - +def prepare_dataset(): mnist = tf.keras.datasets.mnist (x_train, y_train), _ = mnist.load_data() dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - 
for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with hook.wrap_tape(tf.GradientTape()) as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - # model.save(trial_dir, save_format="tf") - - trial = smd.create_trial(trial_dir) - assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] - assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ - "weights/dense/kernel:0", - "weights/dense_1/kernel:0", - ] - assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ - "weights/dense/bias:0", - "weights/dense_1/bias:0", - ] - assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ - "Adam/beta_1:0", - "Adam/beta_2:0", - "Adam/decay:0", - "Adam/iter:0", - "Adam/learning_rate:0", - ] - assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] - assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] - + dataset = dataset.shuffle(1000).batch(64) + return dataset -def helper_native_tf2_profiler(trial_dir, hook): +def helper_native_tf2_gradtape( + hook, debugger=False, python_profiler=None, start_step=None, end_step=None +): def get_grads(images, labels): return model(images, training=True) @@ -176,73 +118,51 @@ def get_grads(images, labels): def train_step(images, labels): return tf.reduce_mean(get_grads(images, labels)) - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) + dataset = prepare_dataset() model = create_model() opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + current_step = 0 n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with tf.GradientTape() as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() + if debugger: + with hook.wrap_tape(tf.GradientTape()) as tape: + hook.profiling_start_batch(mode=smd.modes.TRAIN) + logits = train_step(data, labels) + if python_profiler and start_step <= current_step < end_step: + assert python_profiler._start_step == current_step + assert python_profiler._start_phase == StepPhase.STEP_START + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + else: + with tf.GradientTape() as tape: + hook.profiling_start_batch(mode=smd.modes.TRAIN) + logits = train_step(data, labels) + if python_profiler and start_step <= current_step < end_step: + assert python_profiler._start_step == current_step + assert python_profiler._start_phase == StepPhase.STEP_START + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, 
model.variables)) + hook.profiling_end_batch(mode=smd.modes.TRAIN) + hook.profiling_end() @pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir, save_all=True) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and + This test executes a TF2 native training script with profiler, enables detailed TF profiling by step, and verifies the number of events. """ assert tf2_profiler_config_parser_by_step.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape(hook=hook) t_events = TensorboardProfilerEvents() @@ -269,15 +189,15 @@ def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step @pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and + This test executes a TF2 native training script with profiler, enables detailed TF profiling by time, and verifies the number of events. """ assert tf2_profiler_config_parser_by_time.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape(hook=hook) # get tensorboard timeline files files = [] @@ -302,67 +222,33 @@ def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parse # consecutive runs. Hence, the approximation in the below asserts. assert num_trace_events >= 700 + @pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): +def test_native_python_profiling_cprofiler(out_dir, tf2_python_cprofiler_config_parser_by_step): """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. + This test executes a TF2 native training script with profiler, enables cprofiler by step, and + verifies the python profiling's steps and expected output files. 
""" - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - - -# @pytest.mark.parametrize("use_pyinstrument", [False, True]) -# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) -def test_native_python_profiling_cprofiler( - out_dir, tf2_python_cprofiler_config_parser_by_step -): assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) + profiler_config_parser, python_profiler = set_up_profiling( + tf2_python_cprofiler_config_parser_by_step + ) config = profiler_config_parser.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) - print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps profiler_name = CPROFILE_NAME allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) hook = Hook(out_dir=out_dir) hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape( + hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) # Test that directory and corresponding files exist. assert os.path.isdir(python_stats_dir) @@ -370,6 +256,9 @@ def test_native_python_profiling_cprofiler( for node_id in os.listdir(python_stats_dir): node_dir_path = os.path.join(python_stats_dir, node_id) stats_dirs = os.listdir(node_dir_path) + # Since python_profiler.stop_profiling for the posthookclose step automatically executed + # upon normal interpreter termination, + # the number of the files is (end_step - start_step) * 2 + 2 - 1. assert len(stats_dirs) == (end_step - start_step) * 2 + 1 for stats_dir in stats_dirs: @@ -388,12 +277,19 @@ def test_native_python_profiling_cprofiler( assert json.load(f) +@pytest.mark.skip_if_non_eager def test_native_python_profiling_pyinstrument( out_dir, tf2_python_pyinstrument_config_parser_by_step ): + """ + This test executes a TF2 native training script with profiler, enables pyinstrument by step, and + verifies the python profiling's steps and expected output files. 
+ """ assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) + profiler_config_parser, python_profiler = set_up_profiling( + tf2_python_pyinstrument_config_parser_by_step + ) config = profiler_config_parser.config start_step = config.python_profiling_config.start_step @@ -402,11 +298,13 @@ def test_native_python_profiling_pyinstrument( profiler_name = PYINSTRUMENT_NAME allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) hook = Hook(out_dir=out_dir) hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape( + hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) # Test that directory and corresponding files exist. assert os.path.isdir(python_stats_dir) @@ -414,6 +312,9 @@ def test_native_python_profiling_pyinstrument( for node_id in os.listdir(python_stats_dir): node_dir_path = os.path.join(python_stats_dir, node_id) stats_dirs = os.listdir(node_dir_path) + # Since python_profiler.stop_profiling for the posthookclose step automatically executed + # upon normal interpreter termination, + # the number of the files is (end_step - start_step) * 2 + 2 - 1. assert len(stats_dirs) == (end_step - start_step) * 2 + 1 for stats_dir in stats_dirs: @@ -432,17 +333,16 @@ def test_native_python_profiling_pyinstrument( assert json.load(f) +@pytest.mark.skip_if_non_eager def test_create_timeline_file(simple_profiler_config_parser, out_dir): """ - This test is meant to test successful creation of the timeline file according to file path specification. - $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ - {$ENV_NODE_ID_4digits0padded}_pythontimeline.json + This test is to test the creation of the timeline file according to file path specification. It reads backs the file contents to make sure it is in valid JSON format. """ assert simple_profiler_config_parser.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape(hook=hook) files = [] for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): @@ -462,4 +362,117 @@ def test_create_timeline_file(simple_profiler_config_parser, out_dir): with open(files[0]) as timeline_file: events_dict = json.load(timeline_file) - assert events_dict \ No newline at end of file + assert events_dict + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_debugger_all_params( + tf2_profiler_config_parser_by_step_all_params, out_dir +): + """ + This test executes a TF2 native training script with debugger and profiler, enables detailed TF profiling, python + profiling by step. 
+ """ + assert tf2_profiler_config_parser_by_step_all_params.profiling_enabled + + profiler_config_parser, python_profiler = set_up_profiling( + tf2_profiler_config_parser_by_step_all_params + ) + + config = profiler_config_parser.config + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) + + hook = Hook(out_dir=out_dir, save_all=True) + hook.python_profiler = python_profiler + helper_native_tf2_gradtape(hook=hook, debugger=True) + + # Verifying python profiling related files. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + # Verifying detailed TF profiling. + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path( + tf2_profiler_config_parser_by_step_all_params.config.local_path + "/framework" + ).rglob(f"*{TENSORBOARDTIMELINE_SUFFIX}"): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + # Verifying timeline files. + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict + + # Verifying tensor names. + trial = smd.create_trial(out_dir) + assert len(trial.steps()) > 0, "Nothing saved at any step." + assert len(trial.tensor_names()) > 0, "Tensors were not saved." 
+    assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"]
+    assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0
+    assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) > 0
+    assert trial.tensor_names(collection="optimizer_variables") == [
+        "Adam/beta_1:0",
+        "Adam/beta_2:0",
+        "Adam/decay:0",
+        "Adam/iter:0",
+        "Adam/learning_rate:0",
+    ]
+    assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"]
+    assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"]

From 1cd9c7eeeedb51597ed1cb15d6ca94419ab236be Mon Sep 17 00:00:00 2001
From: Nihal Harish
Date: Wed, 13 Jan 2021 15:25:40 -0700
Subject: [PATCH 27/97] Modify distributed_training_utils.py import for TF 2.4
 (#422)

---
 smdebug/tensorflow/utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py
index af9a0e901..11d700dfc 100644
--- a/smdebug/tensorflow/utils.py
+++ b/smdebug/tensorflow/utils.py
@@ -427,6 +427,10 @@ def is_tf_version_greater_than_2_4_x():
     return version.parse("2.4.0") <= TF_VERSION
 
 
+def is_tf_version_greater_than_2_4_x():
+    return version.parse("2.4.0") <= version.parse(tf.__version__)
+
+
 def is_profiler_supported_for_tf_version():
     # Profiler Support Added For TF Versions 2.2.0 And Greater
     return version.parse("2.2.0") <= TF_VERSION

From e0da3aaa8a102d5a52ea32d5604c58dbf1a54865 Mon Sep 17 00:00:00 2001
From: sophiayue1116
Date: Mon, 18 Jan 2021 19:48:15 -0800
Subject: [PATCH 28/97] remove print statement

---
 smdebug/tensorflow/keras.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py
index dbfc8bf1d..439ee09a8 100644
--- a/smdebug/tensorflow/keras.py
+++ b/smdebug/tensorflow/keras.py
@@ -1105,7 +1105,7 @@ def unwrap(func):
 
     def close(self):
         self._cleanup()
-        print("\nStep Number in the close function: ", self.step)
+        # print("\nStep Number in the close function: ", self.step)
         if self.python_profiler:
             # print("python profiling for end of last train step to end of training")
             self.python_profiler.start_profiling(
@@ -1153,7 +1153,7 @@ def run(*args, **kwargs):
                 self.prepared_collections = True
 
             self._increment_step()
-            print("\nStep number in the push tape: ", self.step)
+            # print("\nStep number in the push tape: ", self.step)
 
             if self._get_collections_to_save_for_step():
                 # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step())
@@ -1244,7 +1244,7 @@ def run(*args, **kwargs):
                 return
 
             self.last_saved_step = self.step
-            print("\nStep number in the pop tape: ", self.step)
+            # print("\nStep number in the pop tape: ", self.step)
 
         return run
 
@@ -1322,7 +1322,7 @@ def profiling_start_batch(self, mode):
         if self.mode != ModeKeys.GLOBAL:
             self.mode_steps[ModeKeys.GLOBAL] = self.step
 
-        print("Step Number in start train batch: ", self.mode_steps[mode])
+        # print("Step Number in start train batch: ", self.mode_steps[mode])
 
         self.profiler_config_parser.load_config()
 
@@ -1385,7 +1385,7 @@ def profiling_end_batch(self, mode):
         """
         Enabling profiler at the end of train batch when native tf2 training is used.
         """
-        print("Step Number in end train batch: ", self.mode_steps[mode])
+        # print("Step Number in end train batch: ", self.mode_steps[mode])
         if self._is_not_supported():
             return
 
@@ -1421,7 +1421,7 @@ def profiling_end(self):
         """
         Stop profiler at the end of training when native tf2 training is used.
""" - print("Step Number at the end of training: ", self.step) + # print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() From dbdf4a199cba2d15ed847b357a244fb92322278a Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:32:23 -0800 Subject: [PATCH 29/97] add changes for enabling profiling for native tf2 training --- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py new file mode 100644 index 000000000..e69de29bb From 4e1fd390fca1a0b632b0874179e237e6a0a3ced9 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:42:55 -0800 Subject: [PATCH 30/97] add changes for enabling profiling in the native tf2 training --- smdebug/tensorflow/keras.py | 7 +- .../tensorflow2/test_native_tf2_profiling.py | 144 ++++++++++++++++++ 2 files changed, 147 insertions(+), 4 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 439ee09a8..22ae7b588 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1271,7 +1271,6 @@ def wrap_tape(self, tape): :return: Wrapped tape of same type as passed. This tape should be used for training """ - # Third Party from tensorflow.python.eager.backprop import GradientTape self.debugger_native_training = True @@ -1356,7 +1355,7 @@ def profiling_start_batch(self, mode): if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] ): if not self.is_detailed_profiling: self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( @@ -1386,7 +1385,6 @@ def profiling_end_batch(self, mode): Enabling profiler at the end of train batch when native tf2 training is used. 
""" # print("Step Number in end train batch: ", self.mode_steps[mode]) - if self._is_not_supported(): return @@ -1427,7 +1425,7 @@ def profiling_end(self): self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): # print("Stop Dataloader profiling") self.is_dataloader_profiling = False @@ -1440,3 +1438,4 @@ def profiling_end(self): start_time_us=self.tf_profiler_start_time_in_micros, ) self.is_detailed_profiling = False + diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index e69de29bb..16b9e121a 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -0,0 +1,144 @@ +# Standard Library +import os +from pathlib import Path + +# Third Party +import tensorflow as tf +import pytest +# from tests.tensorflow2.test_keras import helper_keras_fit + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.profiler.profiler_config_parser import ProfilerConfigParser +from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX +from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents +from smdebug.tensorflow import KerasHook as Hook + +@pytest.fixture() +def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +def create_hook(trial_dir): + hook = smd.KerasHook(trial_dir, save_all=True) + return hook + + +def create_model(): + model = tf.keras.models.Sequential( + [ + # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 + tf.keras.layers.Flatten(input_shape=(28, 28, 1)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) + return model + +def helper_native_tf2(): + + +def test_gradtape_tf_function(out_dir): + def get_grads(images, labels): + # with tf.GradientTape() as tape: + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(64) + model = create_model() + hook = create_hook(out_dir) + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + dataset_labels = labels + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with hook.wrap_tape(tf.GradientTape()) as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + # hook.save_tensor("logits", logits, 
CollectionKeys.OUTPUTS) + # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + model.save(out_dir, save_format="tf") + + + trial = smd.create_trial(out_dir) + assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ + "weights/dense/kernel:0", + "weights/dense_1/kernel:0", + ] + assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ + "weights/dense/bias:0", + "weights/dense_1/bias:0", + ] + assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] + + +def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) + hook.close() + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. 
+ assert num_trace_events >= 230 \ No newline at end of file From cf29f10f96ba01776080055febe3d0f683477a0b Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 11 Jan 2021 11:08:36 -0800 Subject: [PATCH 31/97] add tests --- ...filer_cprofiler_config_parser_by_step.json | 1 + ...er_pyinstrument_config_parser_by_step.json | 1 + .../tensorflow2/test_native_tf2_profiling.py | 349 +++++++++++++++++- 3 files changed, 331 insertions(+), 20 deletions(-) diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index e51c386c2..f06218f77 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -5,3 +5,4 @@ "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } } + diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 53ac1485e..ad5a555f7 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -5,3 +5,4 @@ "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } + diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 16b9e121a..3e1f08b2b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -1,19 +1,40 @@ # Standard Library import os +import json +import time +from datetime import datetime from pathlib import Path +import pstats # Third Party import tensorflow as tf import pytest -# from tests.tensorflow2.test_keras import helper_keras_fit # First Party import smdebug.tensorflow as smd from smdebug.core.collection import CollectionKeys +from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter from smdebug.profiler.profiler_config_parser import ProfilerConfigParser from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents from smdebug.tensorflow import KerasHook as Hook +from smdebug.profiler.profiler_constants import ( + CONVERT_TO_MICROSECS, + DEFAULT_PREFIX, + TRACE_DIRECTORY_FORMAT, + CPROFILE_NAME, + CPROFILE_STATS_FILENAME, + PYINSTRUMENT_HTML_FILENAME, + PYINSTRUMENT_JSON_FILENAME, + PYINSTRUMENT_NAME, +) +from smdebug.profiler.python_profiler import PythonProfiler +from smdebug.profiler.python_profiler import ( + PyinstrumentPythonProfiler, + cProfilePythonProfiler, + cProfileTimer, +) +from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase @pytest.fixture() def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @@ -22,6 +43,20 @@ def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): return ProfilerConfigParser() +@pytest.fixture() +def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def 
tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + @pytest.fixture() def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") @@ -29,9 +64,24 @@ def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): return ProfilerConfigParser() -def create_hook(trial_dir): - hook = smd.KerasHook(trial_dir, save_all=True) - return hook +@pytest.fixture +def test_framework(): + return "test-framework" + + +@pytest.fixture() +def cprofile_python_profiler(out_dir, test_framework): + return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) + + +@pytest.fixture() +def pyinstrument_python_profiler(out_dir, test_framework): + return PyinstrumentPythonProfiler(out_dir, test_framework) + + +@pytest.fixture() +def framework_dir(out_dir, test_framework): + return "{0}/framework/{1}".format(out_dir, test_framework) def create_model(): @@ -46,12 +96,10 @@ def create_model(): ) return model -def helper_native_tf2(): +def helper_native_tf2_profiler_debugger(trial_dir, hook): -def test_gradtape_tf_function(out_dir): def get_grads(images, labels): - # with tf.GradientTape() as tape: return model(images, training=True) @tf.function @@ -63,32 +111,29 @@ def train_step(images, labels): dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) ) - dataset = dataset.shuffle(1000).batch(64) + dataset = dataset.shuffle(1000).batch(128) model = create_model() - hook = create_hook(out_dir) opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: - dataset_labels = labels labels = tf.one_hot(labels, depth=10) hook.start_profiling_start_train_batch() with hook.wrap_tape(tf.GradientTape()) as tape: logits = train_step(data, labels) grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) - # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(out_dir, save_format="tf") - + model.save(trial_dir, save_format="tf") - trial = smd.create_trial(out_dir) + trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ "weights/dense/kernel:0", @@ -109,7 +154,40 @@ def train_step(images, labels): assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] -def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): +def helper_native_tf2_profiler(trial_dir, hook): + + def get_grads(images, labels): + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = 
tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(128) + model = create_model() + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with tf.GradientTape() as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): """ This test executes a TF2 native training script, enables detailed TF profiling by step, and verifies the number of events. @@ -117,8 +195,7 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config assert tf2_profiler_config_parser_by_step.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) - hook.close() + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -141,4 +218,236 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config # The number of events is varying by a small number on # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 \ No newline at end of file + assert num_trace_events >= 230 + + +def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. 
+ """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + + +# @pytest.mark.parametrize("use_pyinstrument", [False, True]) +# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) +def test_native_python_profiling_cprofiler( + out_dir, tf2_python_cprofiler_config_parser_by_step +): + assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled + + config = tf2_python_cprofiler_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. 
+ assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_native_python_profiling_pyinstrument( + out_dir, tf2_python_pyinstrument_config_parser_by_step +): + assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled + + config = tf2_python_pyinstrument_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.profiler_name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = PYINSTRUMENT_NAME + allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_create_timeline_file(simple_profiler_config_parser, out_dir): + """ + This test is meant to test successful creation of the timeline file according to file path specification. + $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ + {$ENV_NODE_ID_4digits0padded}_pythontimeline.json + It reads backs the file contents to make sure it is in valid JSON format. 
+ """ + assert simple_profiler_config_parser.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict \ No newline at end of file From 7e9af0db3b7fc4d17df4d0fddc5504554be257cb Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 12:05:19 -0800 Subject: [PATCH 32/97] add python profiler as attr for kerashook --- smdebug/tensorflow/keras.py | 9 ++------- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 6 +++++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 22ae7b588..e1111045b 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,7 +61,6 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) -# print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -1105,9 +1104,8 @@ def unwrap(func): def close(self): self._cleanup() - # print("\nStep Number in the close function: ", self.step) + if self.python_profiler: - # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1115,6 +1113,7 @@ def close(self): ) self.debugger_native_training = False + def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1337,7 +1336,6 @@ def profiling_start_batch(self, mode): self.is_dataloader_profiling = False if self.python_profiler: - # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1346,7 +1344,6 @@ def profiling_start_batch(self, mode): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - # print("Start python profiling in start train batch") self.python_profiler.start_profiling( StepPhase.STEP_START, start_mode=mode_keys_to_python_profile_mode(mode), @@ -1399,7 +1396,6 @@ def profiling_end_batch(self, mode): ) if self.python_profiler: - # print("Stop python profiling in end train batch") self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1408,7 +1404,6 @@ def profiling_end_batch(self, mode): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - # print("Start python profiling in end train batch") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(mode), diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 3e1f08b2b..9ba8dd60b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ 
b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -173,6 +173,8 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + + # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -338,6 +340,7 @@ def test_native_python_profiling_cprofiler( print('\nnum_steps: ', config.python_profiling_config.num_steps) python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -347,6 +350,7 @@ def test_native_python_profiling_cprofiler( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. @@ -384,7 +388,7 @@ def test_native_python_profiling_pyinstrument( print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps From 1acfdb5e76209ad531501722ecf42d2c94400836 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 16:01:59 -0800 Subject: [PATCH 33/97] add tests --- smdebug/tensorflow/keras.py | 2 +- .../tensorflow2/test_native_tf2_profiling.py | 34 ++++++++++++------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index e1111045b..2b8c2a9eb 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -316,7 +316,7 @@ def _create_tensors_for_matching_collections( for t in tensor_refs: self.tensor_to_collections[t.name] = colls_with_tensor elif colls_with_tensor: - # we should only readd tensors which were already added if these are variables + # we should only read tensors which were already added if these are variables # other tensors are part of a different mode, and will cause a crash if fetched # because their input placeholders will not be passed. 
if any( diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 9ba8dd60b..9a7416be3 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -5,6 +5,7 @@ from datetime import datetime from pathlib import Path import pstats +import atexit # Third Party import tensorflow as tf @@ -84,6 +85,18 @@ def framework_dir(out_dir, test_framework): return "{0}/framework/{1}".format(out_dir, test_framework) +def set_up_profiling(profilerconfig): + profiler_config_parser = profilerconfig + python_profiler = None + if profiler_config_parser.profiling_enabled: + config = profiler_config_parser.config + if config.python_profiling_config.is_enabled(): + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + python_profiler.start_profiling(StepPhase.START) + atexit.register(python_profiler.stop_profiling, StepPhase.END) + return profiler_config_parser, python_profiler + + def create_model(): model = tf.keras.models.Sequential( [ @@ -131,7 +144,7 @@ def train_step(images, labels): hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(trial_dir, save_format="tf") + # model.save(trial_dir, save_format="tf") trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] @@ -173,8 +186,6 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) - - # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -196,7 +207,7 @@ def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parse """ assert tf2_profiler_config_parser_by_step.profiling_enabled - hook = Hook(out_dir=out_dir) + hook = Hook(out_dir=out_dir, save_all=True) helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -333,13 +344,13 @@ def test_native_python_profiling_cprofiler( ): assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - config = tf2_python_cprofiler_config_parser_by_step.config + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) + + config = profiler_config_parser.config print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) print('\nname: ', config.python_profiling_config.name) print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps @@ -382,13 +393,9 @@ def test_native_python_profiling_pyinstrument( ): assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - config = tf2_python_pyinstrument_config_parser_by_step.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.profiler_name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + config = profiler_config_parser.config start_step = 
config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -398,6 +405,7 @@ def test_native_python_profiling_pyinstrument( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. From 68f683836a76328ce7cb8644c466a1893188f6ba Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 14:16:24 -0800 Subject: [PATCH 34/97] update profiler for native tf training --- smdebug/tensorflow/keras.py | 24 +- ...filer_cprofiler_config_parser_by_step.json | 3 +- ...er_pyinstrument_config_parser_by_step.json | 1 - .../tensorflow2/test_native_tf2_profiling.py | 465 ------------------ 4 files changed, 16 insertions(+), 477 deletions(-) delete mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 2b8c2a9eb..72f15e9f0 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,6 +61,10 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) +<<<<<<< HEAD +======= +# print('prezero-step start profiling object outside: ', python_profiler) +>>>>>>> update profiler for native tf training class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -144,6 +148,7 @@ def _is_not_supported(self): self._hook_supported = False elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: try: +<<<<<<< HEAD if is_tf_version_greater_than_2_4_x(): # distributed_training_utils.py renamed to distributed_training_utils_v1 in tf 2.4.0 from tensorflow.python.keras.distribute.distributed_training_utils_v1 import ( @@ -154,6 +159,12 @@ def _is_not_supported(self): get_distributed_model, ) +======= + # Third Party + from tensorflow.python.keras.distribute.distributed_training_utils import ( + get_distributed_model, + ) +>>>>>>> update profiler for native tf training except ImportError: # for tf1.13 we can't import this, so we can't support mirrored strategy self.logger.info( @@ -1106,6 +1117,7 @@ def close(self): self._cleanup() if self.python_profiler: + # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1113,7 +1125,6 @@ def close(self): ) self.debugger_native_training = False - def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1152,7 +1163,6 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() - # print("\nStep number in the push tape: ", self.step) if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1243,7 +1253,6 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - # print("\nStep number in the pop tape: ", self.step) return run @@ -1320,7 +1329,7 @@ def profiling_start_batch(self, mode): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - # print("Step Number in start train batch: ", self.mode_steps[mode]) + print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() @@ -1336,6 +1345,7 @@ def profiling_start_batch(self, mode): 
self.is_dataloader_profiling = False if self.python_profiler: + # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1381,7 +1391,6 @@ def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1414,13 +1423,11 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. """ - # print("Step Number at the end of training: ", self.step) - # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): # print("Stop Dataloader profiling") self.is_dataloader_profiling = False @@ -1433,4 +1440,3 @@ def profiling_end(self): start_time_us=self.tf_profiler_start_time_in_micros, ) self.is_detailed_profiling = False - diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index f06218f77..d568b471c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,5 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} - +} \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index ad5a555f7..53ac1485e 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -5,4 +5,3 @@ "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } - diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py deleted file mode 100644 index 9a7416be3..000000000 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ /dev/null @@ -1,465 +0,0 @@ -# Standard Library -import os -import json -import time -from datetime import datetime -from pathlib import Path -import pstats -import atexit - -# Third Party -import tensorflow as tf -import pytest - -# First Party -import smdebug.tensorflow as smd -from smdebug.core.collection import CollectionKeys -from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter -from smdebug.profiler.profiler_config_parser import ProfilerConfigParser -from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX -from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents -from smdebug.tensorflow import KerasHook as Hook -from smdebug.profiler.profiler_constants import ( - CONVERT_TO_MICROSECS, - DEFAULT_PREFIX, - TRACE_DIRECTORY_FORMAT, - CPROFILE_NAME, - CPROFILE_STATS_FILENAME, - PYINSTRUMENT_HTML_FILENAME, - PYINSTRUMENT_JSON_FILENAME, - PYINSTRUMENT_NAME, -) -from smdebug.profiler.python_profiler import PythonProfiler -from smdebug.profiler.python_profiler import ( - PyinstrumentPythonProfiler, - cProfilePythonProfiler, - 
cProfileTimer, -) -from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase - -@pytest.fixture() -def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture -def test_framework(): - return "test-framework" - - -@pytest.fixture() -def cprofile_python_profiler(out_dir, test_framework): - return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) - - -@pytest.fixture() -def pyinstrument_python_profiler(out_dir, test_framework): - return PyinstrumentPythonProfiler(out_dir, test_framework) - - -@pytest.fixture() -def framework_dir(out_dir, test_framework): - return "{0}/framework/{1}".format(out_dir, test_framework) - - -def set_up_profiling(profilerconfig): - profiler_config_parser = profilerconfig - python_profiler = None - if profiler_config_parser.profiling_enabled: - config = profiler_config_parser.config - if config.python_profiling_config.is_enabled(): - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") - python_profiler.start_profiling(StepPhase.START) - atexit.register(python_profiler.stop_profiling, StepPhase.END) - return profiler_config_parser, python_profiler - - -def create_model(): - model = tf.keras.models.Sequential( - [ - # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 - tf.keras.layers.Flatten(input_shape=(28, 28, 1)), - tf.keras.layers.Dense(128, activation="relu"), - tf.keras.layers.Dropout(0.2), - tf.keras.layers.Dense(10, activation="softmax"), - ] - ) - return model - - -def helper_native_tf2_profiler_debugger(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with hook.wrap_tape(tf.GradientTape()) as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - 
hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - # model.save(trial_dir, save_format="tf") - - trial = smd.create_trial(trial_dir) - assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] - assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ - "weights/dense/kernel:0", - "weights/dense_1/kernel:0", - ] - assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ - "weights/dense/bias:0", - "weights/dense_1/bias:0", - ] - assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ - "Adam/beta_1:0", - "Adam/beta_2:0", - "Adam/decay:0", - "Adam/iter:0", - "Adam/learning_rate:0", - ] - assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] - assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] - - -def helper_native_tf2_profiler(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with tf.GradientTape() as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir, save_all=True) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - -def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. 
- """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. 
- assert num_trace_events >= 700 - - -# @pytest.mark.parametrize("use_pyinstrument", [False, True]) -# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) -def test_native_python_profiling_cprofiler( - out_dir, tf2_python_cprofiler_config_parser_by_step -): - assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) - - config = profiler_config_parser.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) - print('\ntest function: ', python_profiler) - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = CPROFILE_NAME - allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # Test that directory and corresponding files exist. - assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -def test_native_python_profiling_pyinstrument( - out_dir, tf2_python_pyinstrument_config_parser_by_step -): - assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = PYINSTRUMENT_NAME - allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # Test that directory and corresponding files exist. 
- assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -def test_create_timeline_file(simple_profiler_config_parser, out_dir): - """ - This test is meant to test successful creation of the timeline file according to file path specification. - $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ - {$ENV_NODE_ID_4digits0padded}_pythontimeline.json - It reads backs the file contents to make sure it is in valid JSON format. - """ - assert simple_profiler_config_parser.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - files = [] - for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): - files.append(path) - - assert len(files) == 1 - - file_ts = files[0].name.split("_")[0] - folder_name = files[0].parent.name - assert folder_name == time.strftime( - TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) - ) - assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( - TRACE_DIRECTORY_FORMAT - ) - - with open(files[0]) as timeline_file: - events_dict = json.load(timeline_file) - - assert events_dict \ No newline at end of file From 63ea8d32c27c1181aed9a767ab8bd8592a804392 Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Wed, 13 Jan 2021 15:25:40 -0700 Subject: [PATCH 35/97] Modify distributed_training_utils.py import for TF 2.4 (#422) --- smdebug/tensorflow/keras.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 72f15e9f0..e9a7a9635 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,10 +61,6 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) -<<<<<<< HEAD -======= -# print('prezero-step start profiling object outside: ', python_profiler) ->>>>>>> update profiler for native tf training class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -148,7 +144,6 @@ def _is_not_supported(self): self._hook_supported = False elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: try: -<<<<<<< HEAD if is_tf_version_greater_than_2_4_x(): # distributed_training_utils.py renamed to distributed_training_utils_v1 in tf 2.4.0 from tensorflow.python.keras.distribute.distributed_training_utils_v1 import ( @@ -159,12 +154,6 @@ def _is_not_supported(self): get_distributed_model, ) -======= - # Third Party - from tensorflow.python.keras.distribute.distributed_training_utils import ( - get_distributed_model, - ) ->>>>>>> update profiler for native tf training except ImportError: # for tf1.13 we can't import this, so we can't support mirrored strategy self.logger.info( From 
c07b169f8b1a69894cd5dfbc6e1880d13e1f3ce6 Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Wed, 13 Jan 2021 16:27:58 -0700 Subject: [PATCH 36/97] Cache TF Versions (#421) --- smdebug/tensorflow/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 11d700dfc..c26247229 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -428,7 +428,7 @@ def is_tf_version_greater_than_2_4_x(): def is_tf_version_greater_than_2_4_x(): - return version.parse("2.4.0") <= version.parse(tf.__version__) + return version.parse("2.4.0") <= TF_VERSION def is_profiler_supported_for_tf_version(): From 11af41058e11387f44f085dddd665d8eecafe1dd Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 19:48:15 -0800 Subject: [PATCH 37/97] remove print statement --- smdebug/tensorflow/keras.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index e9a7a9635..4f79ea629 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1104,7 +1104,7 @@ def unwrap(func): def close(self): self._cleanup() - + # print("\nStep Number in the close function: ", self.step) if self.python_profiler: # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( @@ -1152,6 +1152,8 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() + # print("\nStep number in the push tape: ", self.step) + if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1242,6 +1244,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step + # print("\nStep number in the pop tape: ", self.step) return run @@ -1318,7 +1321,7 @@ def profiling_start_batch(self, mode): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - print("Step Number in start train batch: ", self.mode_steps[mode]) + # print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() @@ -1380,6 +1383,7 @@ def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ + # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1412,6 +1416,7 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" + # print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() From aa677560c968c76e2d40af6e89a1cdea9d1c9642 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 22:02:39 -0800 Subject: [PATCH 38/97] clean up the code --- smdebug/tensorflow/keras.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 4f79ea629..8485677a9 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1152,8 +1152,6 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() - # print("\nStep number in the push tape: ", self.step) - if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1244,7 +1242,6 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - # print("\nStep number in the pop tape: ", self.step) return run @@ -1314,6 +1311,8 @@ def profiling_start_batch(self, mode): self.set_mode(mode) + # When only profiler is enabled in the native tf2 training, + # increasing the step number in the TRAIN and GLOBAL mode. if not self.debugger_native_training: self.step += 1 self.mode_steps[self.mode] += 1 @@ -1321,8 +1320,6 @@ def profiling_start_batch(self, mode): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - # print("Step Number in start train batch: ", self.mode_steps[mode]) - self.profiler_config_parser.load_config() if self.profiler_config_parser.should_save_metrics( @@ -1337,7 +1334,6 @@ def profiling_start_batch(self, mode): self.is_dataloader_profiling = False if self.python_profiler: - # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1354,7 +1350,7 @@ def profiling_start_batch(self, mode): if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] ): if not self.is_detailed_profiling: self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( @@ -1383,7 +1379,6 @@ def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1416,14 +1411,12 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" - # print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( TF_DATALOADER_END_FLAG_FILENAME ): - # print("Stop Dataloader profiling") self.is_dataloader_profiling = False if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: From 6caa2b8a62bd950346045ab5dc1f412755e55ef0 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 23:22:21 -0800 Subject: [PATCH 39/97] clean up code --- smdebug/tensorflow/keras.py | 6 +----- smdebug/tensorflow/utils.py | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 8485677a9..267d71257 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -105,7 +105,6 @@ def __init__( # Profiling vars self.tf_profiler = None if is_profiler_supported_for_tf_version(): - # Third Party from tensorflow.python.profiler import profiler_v2 as tf_profiler self.tf_profiler = tf_profiler @@ -336,7 +335,6 @@ def _create_tensors_for_matching_collections( def _get_distributed_model(self, mode): # not available in tf 1.13, code shouldn't reach here for 1.13 # because of _is_not_supported - # Third Party from tensorflow.python.keras.distribute.distributed_training_utils import ( get_distributed_model, ) @@ -1104,9 +1102,8 @@ def unwrap(func): def close(self): self._cleanup() - # print("\nStep Number in the close function: ", self.step) + if self.python_profiler: - # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1154,7 +1151,6 @@ def run(*args, **kwargs): self._increment_step() if self._get_collections_to_save_for_step(): - # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) self._initialize_writers() if self.last_saved_step is not None and self._exported_collections is False: diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index c26247229..af9a0e901 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -427,10 +427,6 @@ def is_tf_version_greater_than_2_4_x(): return version.parse("2.4.0") <= TF_VERSION -def is_tf_version_greater_than_2_4_x(): - return version.parse("2.4.0") <= TF_VERSION - - def is_profiler_supported_for_tf_version(): # Profiler Support Added For TF Versions 2.2.0 And Greater return version.parse("2.2.0") <= TF_VERSION From ce8c4506ccb7893892471921f44899e038ac6558 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 23:56:45 -0800 Subject: [PATCH 40/97] update format --- ...est_tf2_python_profiler_cprofiler_config_parser_by_step.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index d568b471c..e51c386c2 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} \ No newline at end of file +} From 861ce68dd10a161b4a2b461ce4e0ed5ee485eca1 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Thu, 21 Jan 
2021 19:43:32 -0800 Subject: [PATCH 41/97] revise on PR --- smdebug/core/hook.py | 9 +- smdebug/tensorflow/keras.py | 286 ++++------ ...iler_all_params_config_parser_by_step.json | 8 - ...filer_cprofiler_config_parser_by_step.json | 7 - ...er_pyinstrument_config_parser_by_step.json | 7 - .../tensorflow2/test_native_tf2_profiler.py | 538 +++++++++--------- 6 files changed, 368 insertions(+), 487 deletions(-) delete mode 100644 tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json delete mode 100644 tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json delete mode 100644 tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 04fceb585..9d8df5dd6 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -558,18 +558,19 @@ def _cleanup(self): if self.first_process is True: remove_claim_file(self.out_dir) - def _increment_step(self): + def _increment_step(self, write_state=True): # Update the last_state to the last step number that was saved or seen - self._write_state() + if write_state: + self._write_state() + self.written_tensor_name_for_step.clear() + self._collections_to_save_for_step = None self.step += 1 self.mode_steps[self.mode] += 1 - self.written_tensor_name_for_step.clear() # Increment Global step number irrespective of what mode it is if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - self._collections_to_save_for_step = None # Called in the internal AWS codebase to determine # if a particular tensor value should be saved diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 267d71257..d5d200647 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -714,6 +714,94 @@ def _remove_fetches_and_callbacks(self, mode): x.fetch_callbacks.pop(tf_obj) self._fetches_added.clear() + def _start_phase_python_profiling(self, mode): + if self.python_profiler: + self.python_profiler.stop_profiling( + StepPhase.STEP_START, + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] + ): + self.python_profiler.start_profiling( + StepPhase.STEP_START, + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], + ) + + def _end_phase_python_profiling(self, mode): + if self.python_profiler: + self.python_profiler.stop_profiling( + StepPhase.STEP_END, + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] + ): + self.python_profiler.start_profiling( + StepPhase.STEP_END, + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], + ) + + def _begin_detailed_profiling(self, mode=ModeKeys.TRAIN): + if is_profiler_supported_for_tf_version(): + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] + ): + if not self.is_detailed_profiling: + self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( + self.profiler_config_parser.config.local_path, + "tensorflow", + self.mode_steps[mode], + ) + self.logger.info(f"Enabling TF profiler on step: = {self.mode_steps[mode]}") + if not self.warm_up_completed: + # warming up profiler before it will be profiling. 
+ self.tf_profiler.warmup() + self.warm_up_completed = True + self.tf_profiler.start(self._log_dir) + self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS + self.is_detailed_profiling = True + elif self.is_detailed_profiling: + self.logger.info(f"Disabling TF profiler on step: ={self.mode_steps[mode]}") + stop_tf_profiler( + tf_profiler=self.tf_profiler, + log_dir=self._log_dir, + start_time_us=self.tf_profiler_start_time_in_micros, + ) + self.is_detailed_profiling = False + + def _end_detailed_profiling(self): + if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: + self.logger.info("Disabling profiler, reached end of training.") + stop_tf_profiler( + tf_profiler=self.tf_profiler, + log_dir=self._log_dir, + start_time_us=self.tf_profiler_start_time_in_micros, + ) + self.is_detailed_profiling = False + + def _begin_dataloader_profiling(self, mode): + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] + ) and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_START_FLAG_FILENAME + ): + self.is_dataloader_profiling = True + elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_END_FLAG_FILENAME + ): + self.is_dataloader_profiling = False + + def _end_dataloader_profiling(self): + if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_END_FLAG_FILENAME + ): + self.is_dataloader_profiling = False + def on_epoch_begin(self, batch, logs=None): pass @@ -753,34 +841,12 @@ def on_test_begin(self, logs=None): self._on_any_mode_begin(ModeKeys.EVAL) def _on_any_mode_end(self, mode): - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_END, - end_mode=mode_keys_to_python_profile_mode(mode), - end_step=self.mode_steps[mode], - ) - self.python_profiler.start_profiling( - StepPhase.STEP_END, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) - - if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME - ): - self.is_dataloader_profiling = False + self._end_phase_python_profiling(mode=mode) + self._end_detailed_profiling() def on_train_end(self, logs=None): self._on_any_mode_end(ModeKeys.TRAIN) - - if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: - self.logger.info("Disabling profiler, reached end of training.") - stop_tf_profiler( - tf_profiler=self.tf_profiler, - log_dir=self._log_dir, - start_time_us=self.tf_profiler_start_time_in_micros, - ) - self.is_detailed_profiling = False + self._end_detailed_profiling() # throws error in keras if this fn is absent def on_test_end(self, logs=None): @@ -826,31 +892,9 @@ def _on_any_batch_begin(self, batch, mode, logs=None): self.profiler_config_parser.load_config() - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] - ) and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_START_FLAG_FILENAME - ): - self.is_dataloader_profiling = True - elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME - ): - self.is_dataloader_profiling = False + self._begin_dataloader_profiling(mode=mode) - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(mode), - 
end_step=self.mode_steps[mode], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] - ): - self.python_profiler.start_profiling( - StepPhase.STEP_START, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) + self._start_phase_python_profiling(mode=mode) if self.prepared_collections is False: # sets prepared_collections to True here @@ -884,37 +928,7 @@ def _on_any_batch_begin(self, batch, mode, logs=None): def on_train_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.TRAIN, logs=logs) - - if is_profiler_supported_for_tf_version(): - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[ModeKeys.TRAIN] - ): - if not self.is_detailed_profiling: - self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( - self.profiler_config_parser.config.local_path, - "tensorflow", - self.mode_steps[ModeKeys.TRAIN], - ) - self.logger.info( - f"Enabling TF profiler on step: = {self.mode_steps[ModeKeys.TRAIN]}" - ) - if not self.warm_up_completed: - # warming up profiler before it will be profiling. - self.tf_profiler.warmup() - self.warm_up_completed = True - self.tf_profiler.start(self._log_dir) - self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS - self.is_detailed_profiling = True - elif self.is_detailed_profiling: - self.logger.info( - f"Disabling TF profiler on step: ={self.mode_steps[ModeKeys.TRAIN]}" - ) - stop_tf_profiler( - tf_profiler=self.tf_profiler, - log_dir=self._log_dir, - start_time_us=self.tf_profiler_start_time_in_micros, - ) - self.is_detailed_profiling = False + self._begin_detailed_profiling() def on_test_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.EVAL, logs=logs) @@ -1007,20 +1021,7 @@ def _on_any_batch_end(self, batch, mode, logs=None): self._export_model() self._exported_model[self.mode] = True - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_END, - end_mode=mode_keys_to_python_profile_mode(mode), - end_step=self.mode_steps[mode], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] - ): - self.python_profiler.start_profiling( - StepPhase.STEP_END, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) + self._end_phase_python_profiling(mode=mode) def on_train_batch_end(self, batch, logs=None): self._on_any_batch_end(batch, ModeKeys.TRAIN, logs=logs) @@ -1296,7 +1297,7 @@ def record_tensor_value(self, tensor_name, tensor_value): self._initialize_writers(only_initialize_if_missing=True) self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) - def profiling_start_batch(self, mode): + def profiling_start_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the start of train batch when native tf2 training is used. """ @@ -1308,70 +1309,20 @@ def profiling_start_batch(self, mode): self.set_mode(mode) # When only profiler is enabled in the native tf2 training, - # increasing the step number in the TRAIN and GLOBAL mode. + # increasing the step number in the TRAIN and GLOBAL mode + # and not writing the state. 
if not self.debugger_native_training: - self.step += 1 - self.mode_steps[self.mode] += 1 - # Increment Global step number irrespective of what mode it is - if self.mode != ModeKeys.GLOBAL: - self.mode_steps[ModeKeys.GLOBAL] = self.step + self._increment_step(write_state=self.debugger_native_training) self.profiler_config_parser.load_config() - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] - ) and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_START_FLAG_FILENAME - ): - self.is_dataloader_profiling = True - elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME - ): - self.is_dataloader_profiling = False + self._begin_dataloader_profiling(mode=mode) - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(mode), - end_step=self.mode_steps[mode], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] - ): - self.python_profiler.start_profiling( - StepPhase.STEP_START, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) + self._start_phase_python_profiling(mode=mode) - if is_profiler_supported_for_tf_version(): - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] - ): - if not self.is_detailed_profiling: - self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( - self.profiler_config_parser.config.local_path, - "tensorflow", - self.mode_steps[mode], - ) - self.logger.info(f"Enabling TF profiler on step: = {self.mode_steps[mode]}") - if not self.warm_up_completed: - # warming up profiler before it will be profiling. - self.tf_profiler.warmup() - self.warm_up_completed = True - self.tf_profiler.start(self._log_dir) - self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS - self.is_detailed_profiling = True - elif self.is_detailed_profiling: - self.logger.info(f"Disabling TF profiler on step: ={self.mode_steps[mode]}") - stop_tf_profiler( - tf_profiler=self.tf_profiler, - log_dir=self._log_dir, - start_time_us=self.tf_profiler_start_time_in_micros, - ) - self.is_detailed_profiling = False + self._begin_detailed_profiling(mode=mode) - def profiling_end_batch(self, mode): + def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the end of train batch when native tf2 training is used. 
""" @@ -1388,20 +1339,7 @@ def profiling_end_batch(self, mode): step_num=str(self.mode_steps[mode]), ) - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_END, - end_mode=mode_keys_to_python_profile_mode(mode), - end_step=self.mode_steps[mode], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] - ): - self.python_profiler.start_profiling( - StepPhase.STEP_END, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) + self._end_phase_python_profiling(mode=mode) def profiling_end(self): """ @@ -1409,17 +1347,5 @@ def profiling_end(self): """ # Unwrap the tape before closing and close the python profiling self.close() - - if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME - ): - self.is_dataloader_profiling = False - - if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: - self.logger.info("Disabling profiler, reached end of training.") - stop_tf_profiler( - tf_profiler=self.tf_profiler, - log_dir=self._log_dir, - start_time_us=self.tf_profiler_start_time_in_micros, - ) - self.is_detailed_profiling = False + self._end_dataloader_profiling() + self._end_detailed_profiling() diff --git a/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json deleted file mode 100644 index c119eebf8..000000000 --- a/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "ProfilingParameters": { - "ProfilerEnabled": true, - "LocalPath": "/tmp/test", - "DetailedProfilingConfig": "{\"StartStep\": 2, \"NumSteps\": 2}", - "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" - } -} diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json deleted file mode 100644 index e51c386c2..000000000 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "ProfilingParameters": { - "ProfilerEnabled": true, - "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" - } -} diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json deleted file mode 100644 index 53ac1485e..000000000 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "ProfilingParameters": { - "ProfilerEnabled": true, - "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" - } -} diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiler.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py index be2b6e124..f3ddea198 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiler.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -32,49 +32,121 @@ from smdebug.tensorflow import KerasHook as Hook -@pytest.fixture() -def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") 
+@pytest.fixture +def profiler_config_path(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "profiler_config.json") monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join( - config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json" - ) - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join( - config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json" - ) - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_profiler_config_parser_by_step_all_params(config_folder, monkeypatch): - config_path = os.path.join( - config_folder, "test_tf2_python_profiler_all_params_config_parser_by_step.json" - ) - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - + yield config_path + if os.path.isfile(config_path): + os.remove(config_path) + + +def _convert_to_string(item): + return '"{0}"'.format(item) if isinstance(item, str) else item + + +def _convert_key_and_value(key, value): + return "{0}: {1}, ".format(_convert_to_string(key), _convert_to_string(value)) + + +def generate_profiler_config_parser(profiling_type, profiler_config_path, profiling_parameters): + python_profiler_config, detailed_profiler_config = "{}", "{}" + + if profiling_type == "PythonProfiling": + start_step, num_steps, profiler_name, cprofile_timer = profiling_parameters + python_profiler_config = "{" + if start_step is not None: + python_profiler_config += _convert_key_and_value("StartStep", start_step) + if num_steps is not None: + python_profiler_config += _convert_key_and_value("NumSteps", num_steps) + if profiler_name is not None: + python_profiler_config += _convert_key_and_value("ProfilerName", profiler_name) + if cprofile_timer is not None: + python_profiler_config += _convert_key_and_value("cProfileTimer", cprofile_timer) + python_profiler_config += "}" + + if profiling_type == "DetailedProfiling": + start_step, num_steps, start_time, duration = profiling_parameters + detailed_profiler_config = "{" + if start_step: + detailed_profiler_config += _convert_key_and_value("StartStep", start_step) + if num_steps: + detailed_profiler_config += _convert_key_and_value("NumSteps", num_steps) + if start_time: + detailed_profiler_config += _convert_key_and_value( + "StartTimeInSecSinceEpoch", start_time + ) + if duration: + detailed_profiler_config += _convert_key_and_value("DurationInSeconds", duration) + detailed_profiler_config += "}" + + full_config = { + "ProfilingParameters": { + "ProfilerEnabled": True, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": python_profiler_config, + "DetailedProfilingConfig": detailed_profiler_config, + } + } + + with open(profiler_config_path, "w") as f: + json.dump(full_config, f) + + profiler_config_parser = ProfilerConfigParser() + assert profiler_config_parser.profiling_enabled + + return profiler_config_parser + + +def 
generate_profiler_config_parser_all_params( + profiler_config_path, python_profiling_parameters, detailed_profiling_parameters +): -def set_up_profiling(profilerconfig): - profiler_config_parser = profilerconfig + start_step_1, num_steps_1, profiler_name, cprofile_timer = python_profiling_parameters + start_step_2, num_steps_2, start_time, duration = detailed_profiling_parameters + + python_profiler_config = "{" + if start_step_1 is not None: + python_profiler_config += _convert_key_and_value("StartStep", start_step_1) + if num_steps_1 is not None: + python_profiler_config += _convert_key_and_value("NumSteps", num_steps_1) + if profiler_name is not None: + python_profiler_config += _convert_key_and_value("ProfilerName", profiler_name) + if cprofile_timer is not None: + python_profiler_config += _convert_key_and_value("cProfileTimer", cprofile_timer) + python_profiler_config += "}" + + detailed_profiler_config = "{" + if start_step_2: + detailed_profiler_config += _convert_key_and_value("StartStep", start_step_2) + if num_steps_2: + detailed_profiler_config += _convert_key_and_value("NumSteps", num_steps_2) + if start_time: + detailed_profiler_config += _convert_key_and_value("StartTimeInSecSinceEpoch", start_time) + if duration: + detailed_profiler_config += _convert_key_and_value("DurationInSeconds", duration) + detailed_profiler_config += "}" + + full_config = { + "ProfilingParameters": { + "ProfilerEnabled": True, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": python_profiler_config, + "DetailedProfilingConfig": detailed_profiler_config, + } + } + + with open(profiler_config_path, "w") as f: + json.dump(full_config, f) + + profiler_config_parser = ProfilerConfigParser() + assert profiler_config_parser.profiling_enabled + + return profiler_config_parser + + +def set_up_profiling(profiler_config_parser): + profiler_config_parser = profiler_config_parser python_profiler = None if profiler_config_parser.profiling_enabled: config = profiler_config_parser.config @@ -123,17 +195,17 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) - current_step = 0 + step = 0 n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: labels = tf.one_hot(labels, depth=10) if debugger: with hook.wrap_tape(tf.GradientTape()) as tape: - hook.profiling_start_batch(mode=smd.modes.TRAIN) + hook.profiling_start_batch() logits = train_step(data, labels) - if python_profiler and start_step <= current_step < end_step: - assert python_profiler._start_step == current_step + if python_profiler and start_step <= step < end_step: + assert python_profiler._start_step == step assert python_profiler._start_phase == StepPhase.STEP_START grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) @@ -142,33 +214,33 @@ def train_step(images, labels): hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) else: with tf.GradientTape() as tape: - hook.profiling_start_batch(mode=smd.modes.TRAIN) + hook.profiling_start_batch() logits = train_step(data, labels) - if python_profiler and start_step <= current_step < end_step: - assert python_profiler._start_step == current_step + if python_profiler and start_step <= step < end_step: + assert python_profiler._start_step == step assert python_profiler._start_phase == StepPhase.STEP_START grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) - hook.profiling_end_batch(mode=smd.modes.TRAIN) + hook.profiling_end_batch() + if python_profiler and 
start_step <= step < end_step: + assert python_profiler._start_step == step + assert python_profiler._start_phase == StepPhase.STEP_END + step += 1 hook.profiling_end() + if python_profiler: + assert python_profiler._start_step == step - 1 + assert python_profiler._start_phase == StepPhase.STEP_END -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): +def verify_num_trace_events(profilerconfig): """ - This test executes a TF2 native training script with profiler, enables detailed TF profiling by step, and - verifies the number of events. + This verifies the number of events when detailed profiling is enabled. """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_gradtape(hook=hook) - t_events = TensorboardProfilerEvents() # get tensorboard timeline files files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + for path in Path(profilerconfig.config.local_path + "/framework").rglob( f"*{TENSORBOARDTIMELINE_SUFFIX}" ): files.append(path) @@ -188,162 +260,38 @@ def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step assert num_trace_events >= 230 -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script with profiler, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_gradtape(hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - - -@pytest.mark.skip_if_non_eager -def test_native_python_profiling_cprofiler(out_dir, tf2_python_cprofiler_config_parser_by_step): - """ - This test executes a TF2 native training script with profiler, enables cprofiler by step, and - verifies the python profiling's steps and expected output files. - """ - assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling( - tf2_python_cprofiler_config_parser_by_step - ) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = CPROFILE_NAME - allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_gradtape( - hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step - ) - - # Test that directory and corresponding files exist. 
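These python-profiling tests only assert that each cProfile dump loads; when debugging a failure it can help to actually read one of the dumps with the standard library. The path below is a made-up placeholder following the <out_dir>/framework/tensorflow/<profiler_name>/<node_id>/<stats_dir>/ layout the tests walk; substitute a real stats file before running:

    import pstats

    # Placeholder path; node and step directory names are generated at runtime.
    stats_path = "/tmp/test/framework/tensorflow/cprofile/0/step_12/python_stats"
    stats = pstats.Stats(stats_path)
    stats.sort_stats("cumulative").print_stats(10)  # ten costliest entries by cumulative time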
- assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - # Since python_profiler.stop_profiling for the posthookclose step automatically executed - # upon normal interpreter termination, - # the number of the files is (end_step - start_step) * 2 + 2 - 1. - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) +def train_loop(out_dir, debugger=False): + hook = Hook(out_dir=out_dir, save_all=True) + helper_native_tf2_gradtape(hook=hook, debugger=debugger) -@pytest.mark.skip_if_non_eager -def test_native_python_profiling_pyinstrument( - out_dir, tf2_python_pyinstrument_config_parser_by_step -): +def verify_tensor_names(out_dir): """ - This test executes a TF2 native training script with profiler, enables pyinstrument by step, and - verifies the python profiling's steps and expected output files. + This verifies the tensor names when debugger is enabled. """ - assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling( - tf2_python_pyinstrument_config_parser_by_step - ) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = PYINSTRUMENT_NAME - allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] - python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_gradtape( - hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step - ) - - # Test that directory and corresponding files exist. - assert os.path.isdir(python_stats_dir) - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - # Since python_profiler.stop_profiling for the posthookclose step automatically executed - # upon normal interpreter termination, - # the number of the files is (end_step - start_step) * 2 + 2 - 1. - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) + trial = smd.create_trial(out_dir) + assert len(trial.steps()) > 0, "Nothing saved at any step." + assert len(trial.tensor_names()) > 0, "Tensors were not saved." 
+ assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0 + assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) > 0 + assert trial.tensor_names(collection="optimizer_variables") == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] -@pytest.mark.skip_if_non_eager -def test_create_timeline_file(simple_profiler_config_parser, out_dir): +def verify_timeline_file(out_dir): """ - This test is to test the creation of the timeline file according to file path specification. + This verifies the creation of the timeline file according to file path specification. It reads backs the file contents to make sure it is in valid JSON format. """ - assert simple_profiler_config_parser.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_gradtape(hook=hook) - files = [] for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): files.append(path) @@ -365,40 +313,47 @@ def test_create_timeline_file(simple_profiler_config_parser, out_dir): assert events_dict -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_debugger_all_params( - tf2_profiler_config_parser_by_step_all_params, out_dir -): +def verify_python_profiling(profiler_name, out_dir, profilerconfig, debugger=False): """ - This test executes a TF2 native training script with debugger and profiler, enables detailed TF profiling, python - profiling by step. + This executes a TF2 native training script with profiler or both profiler and debugger, + enables python profiling by step, and verifies the python profiling's steps and expected output files. """ - assert tf2_profiler_config_parser_by_step_all_params.profiling_enabled + assert profilerconfig.profiling_enabled - profiler_config_parser, python_profiler = set_up_profiling( - tf2_profiler_config_parser_by_step_all_params - ) + profiler_config_parser, python_profiler = set_up_profiling(profilerconfig) config = profiler_config_parser.config start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps - profiler_name = CPROFILE_NAME - allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) + if profiler_name == CPROFILE_NAME: + allowed_files = [CPROFILE_STATS_FILENAME] + if profiler_name == PYINSTRUMENT_NAME: + allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] + + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) hook = Hook(out_dir=out_dir, save_all=True) hook.python_profiler = python_profiler - helper_native_tf2_gradtape(hook=hook, debugger=True) + helper_native_tf2_gradtape( + hook=hook, + python_profiler=python_profiler, + start_step=start_step, + end_step=end_step, + debugger=debugger, + ) - # Verifying python profiling related files. + # Test that directory and corresponding files exist. 
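verify_timeline_file relies on the pevents naming scheme quoted earlier in this file's history ($START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_{node}_pythontimeline.json). A worked example of that relationship, assuming TRACE_DIRECTORY_FORMAT is the "%y%m%d%H" pattern implied by YYMMDDHR and CONVERT_TO_MICROSECS is 1_000_000 (both are assumptions, and the timestamp is made up):

    import time

    file_ts_us = 1611792000000000  # from a hypothetical "1611792000000000_0000_pythontimeline.json"
    folder = time.strftime("%y%m%d%H", time.gmtime(file_ts_us / 1_000_000))
    assert folder == "21012800"  # 2021-01-28, hour 00 UTC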
assert os.path.isdir(python_stats_dir) for node_id in os.listdir(python_stats_dir): node_dir_path = os.path.join(python_stats_dir, node_id) stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + # Since python_profiler.stop_profiling for the posthookclose step automatically executed + # upon normal interpreter termination, + # the number of the files is num_steps * 2 + 2 - 1. + assert len(stats_dirs) == num_steps * 2 + 1 for stats_dir in stats_dirs: # Validate that the expected files are in the stats dir @@ -415,64 +370,85 @@ def test_native_tf2_profiler_debugger_all_params( with open(stats_path, "r") as f: assert json.load(f) - # Verifying detailed TF profiling. - t_events = TensorboardProfilerEvents() - # get tensorboard timeline files - files = [] - for path in Path( - tf2_profiler_config_parser_by_step_all_params.config.local_path + "/framework" - ).rglob(f"*{TENSORBOARDTIMELINE_SUFFIX}"): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - # Verifying timeline files. - files = [] - for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): - files.append(path) - - assert len(files) == 1 - - file_ts = files[0].name.split("_")[0] - folder_name = files[0].parent.name - assert folder_name == time.strftime( - TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) - ) - assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( - TRACE_DIRECTORY_FORMAT - ) - - with open(files[0]) as timeline_file: - events_dict = json.load(timeline_file) - - assert events_dict - - # Verifying tensor names. - trial = smd.create_trial(out_dir) - assert len(trial.steps()) > 0, "Nothing saved at any step." - assert len(trial.tensor_names()) > 0, "Tensors were not saved." 
- assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] - assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0 - assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) > 0 - assert trial.tensor_names(collection="optimizer_variables") == [ - "Adam/beta_1:0", - "Adam/beta_2:0", - "Adam/decay:0", - "Adam/iter:0", - "Adam/learning_rate:0", - ] - assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] - assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] +@pytest.mark.skip_if_non_eager +@pytest.mark.parametrize("enable_detailed_profiling", [False, True]) +@pytest.mark.parametrize("enable_python_profiling", [False, CPROFILE_NAME, PYINSTRUMENT_NAME]) +@pytest.mark.parametrize("enable_debugger", [False, True]) +def test_native_tf2_profiling_debugger( + enable_detailed_profiling, + enable_python_profiling, + enable_debugger, + profiler_config_path, + out_dir, +): + if not enable_debugger: + if enable_detailed_profiling and not enable_python_profiling: + profiler_config_parser = generate_profiler_config_parser( + "DetailedProfiling", profiler_config_path, (8, 4, None, None) + ) + train_loop(out_dir) + verify_num_trace_events(profiler_config_parser) + verify_timeline_file(out_dir) + elif not enable_detailed_profiling and enable_python_profiling: + if enable_python_profiling == CPROFILE_NAME: + profiler_config_parser = generate_profiler_config_parser( + "PythonProfiling", profiler_config_path, (5, 2, CPROFILE_NAME, None) + ) + verify_python_profiling(CPROFILE_NAME, out_dir, profiler_config_parser) + verify_timeline_file(out_dir) + if enable_python_profiling == PYINSTRUMENT_NAME: + profiler_config_parser = generate_profiler_config_parser( + "PythonProfiling", profiler_config_path, (10, 3, PYINSTRUMENT_NAME, None) + ) + verify_python_profiling(PYINSTRUMENT_NAME, out_dir, profiler_config_parser) + verify_timeline_file(out_dir) + elif enable_detailed_profiling and enable_python_profiling: + profiler_config_parser = generate_profiler_config_parser_all_params( + profiler_config_path, (4, 2, enable_python_profiling, None), (8, 1, None, None) + ) + verify_python_profiling(enable_python_profiling, out_dir, profiler_config_parser) + verify_num_trace_events(profiler_config_parser) + verify_timeline_file(out_dir) + else: + pass + else: + if enable_detailed_profiling and not enable_python_profiling: + profiler_config_parser = generate_profiler_config_parser( + "DetailedProfiling", profiler_config_path, (8, 4, None, None) + ) + train_loop(out_dir, debugger=True) + verify_num_trace_events(profiler_config_parser) + verify_timeline_file(out_dir) + verify_tensor_names(out_dir) + elif not enable_detailed_profiling and enable_python_profiling: + if enable_python_profiling == CPROFILE_NAME: + profiler_config_parser = generate_profiler_config_parser( + "PythonProfiling", profiler_config_path, (5, 2, CPROFILE_NAME, None) + ) + verify_python_profiling( + CPROFILE_NAME, out_dir, profiler_config_parser, debugger=True + ) + verify_timeline_file(out_dir) + verify_tensor_names(out_dir) + if enable_python_profiling == PYINSTRUMENT_NAME: + profiler_config_parser = generate_profiler_config_parser( + "PythonProfiling", profiler_config_path, (10, 3, PYINSTRUMENT_NAME, None) + ) + verify_python_profiling( + PYINSTRUMENT_NAME, out_dir, profiler_config_parser, debugger=True + ) + verify_timeline_file(out_dir) + verify_tensor_names(out_dir) + elif enable_detailed_profiling and enable_python_profiling: + profiler_config_parser = 
generate_profiler_config_parser_all_params( + profiler_config_path, (4, 2, enable_python_profiling, None), (8, 1, None, None) + ) + verify_python_profiling( + enable_python_profiling, out_dir, profiler_config_parser, debugger=True + ) + verify_num_trace_events(profiler_config_parser) + verify_timeline_file(out_dir) + verify_tensor_names(out_dir) + else: + pass From 6ea117cb7c7977f8f054711432ee4994048f0174 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Fri, 22 Jan 2021 00:13:07 -0800 Subject: [PATCH 42/97] update _on_any_mode_end() func for the posthookclose python profiling --- smdebug/tensorflow/keras.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index be88e4da8..482058a3f 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -841,7 +841,18 @@ def on_test_begin(self, logs=None): self._on_any_mode_begin(ModeKeys.EVAL) def _on_any_mode_end(self, mode): - self._end_phase_python_profiling(mode=mode) + if self.python_profiler: + self.python_profiler.stop_profiling( + StepPhase.STEP_END, + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], + ) + self.python_profiler.start_profiling( + StepPhase.STEP_END, + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], + ) + self._end_dataloader_profiling() def on_train_end(self, logs=None): From a59d82b3b9877f0bad33e706d316284a3966581c Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Fri, 22 Jan 2021 11:11:03 -0800 Subject: [PATCH 43/97] rename the debugger native training flag and update the path join in the unit test --- smdebug/tensorflow/keras.py | 10 +++++----- tests/profiler/tensorflow2/test_native_tf2_profiler.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 482058a3f..eb8324897 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -123,7 +123,7 @@ def __init__( # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False # this flag indicates to debugging for tensorflow2 native training - self.debugger_native_training = False + self.is_debugger_enabled_for_native_training = False if self.python_profiler: atexit.register(self.python_profiler.stop_profiling, StepPhase.END) @@ -1121,7 +1121,7 @@ def close(self): start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) - self.debugger_native_training = False + self.is_debugger_enabled_for_native_training = False def _cleanup(self): # Unwrap the tape before closing @@ -1278,7 +1278,7 @@ def wrap_tape(self, tape): """ from tensorflow.python.eager.backprop import GradientTape - self.debugger_native_training = True + self.is_debugger_enabled_for_native_training = True self.set_mode(ModeKeys.TRAIN) if isinstance(tape, GradientTape): @@ -1322,8 +1322,8 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): # When only profiler is enabled in the native tf2 training, # increasing the step number in the TRAIN and GLOBAL mode # and not writing the state. 
- if not self.debugger_native_training: - self._increment_step(write_state=self.debugger_native_training) + if not self.is_debugger_enabled_for_native_training: + self._increment_step(write_state=self.is_debugger_enabled_for_native_training) self.profiler_config_parser.load_config() diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiler.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py index 79b0e52b9..56a76b907 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiler.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -241,7 +241,7 @@ def verify_num_trace_events(profilerconfig): # get tensorboard timeline files files = [] - for path in Path(profilerconfig.config.local_path + "/framework").rglob( + for path in Path(os.path.join(profilerconfig.config.local_path + "/framework")).rglob( f"*{TENSORBOARDTIMELINE_SUFFIX}" ): files.append(path) @@ -294,7 +294,7 @@ def verify_timeline_file(out_dir): It reads backs the file contents to make sure it is in valid JSON format. """ files = [] - for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + for path in Path(os.path.join(out_dir + "/" + DEFAULT_PREFIX)).rglob("*.json"): files.append(path) assert len(files) == 1 From 94ea4f3e98443e4b44e511965080689df3e205fb Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 25 Jan 2021 13:48:01 -0800 Subject: [PATCH 44/97] update format --- smdebug/tensorflow/keras.py | 29 +++++++++++++++---- .../tensorflow2/test_native_tf2_profiler.py | 3 +- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index eb8324897..81a55b863 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -123,7 +123,9 @@ def __init__( # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False # this flag indicates to debugging for tensorflow2 native training - self.is_debugger_enabled_for_native_training = False + # self.is_debugger_enabled_for_native_training = False + self.is_profiler_enabled_for_native_training = False + # self.step_incremented_in_profiling_begin = False if self.python_profiler: atexit.register(self.python_profiler.stop_profiling, StepPhase.END) @@ -714,6 +716,16 @@ def _remove_fetches_and_callbacks(self, mode): x.fetch_callbacks.pop(tf_obj) self._fetches_added.clear() + def _decrement_step(self): + # Called when both profiler and debugger are enabled in the native training loop + # to adjust the step number + self.step -= 1 + self.mode_steps[self.mode] -= 1 + + # Increment Global step number irrespective of what mode it is + if self.mode != ModeKeys.GLOBAL: + self.mode_steps[ModeKeys.GLOBAL] = self.step + def _start_phase_python_profiling(self, mode): if self.python_profiler: self.python_profiler.stop_profiling( @@ -1121,7 +1133,7 @@ def close(self): start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) - self.is_debugger_enabled_for_native_training = False + self.is_profiler_enabled_for_native_training = False def _cleanup(self): # Unwrap the tape before closing @@ -1278,9 +1290,14 @@ def wrap_tape(self, tape): """ from tensorflow.python.eager.backprop import GradientTape - self.is_debugger_enabled_for_native_training = True self.set_mode(ModeKeys.TRAIN) + # When both profiler and debugger are enabled in the native training, step number is firstly increased by 1 in + # the profiling_start_batch() function, and should be decreased by 1 here in order to keep the step number 
+ # correct when calling _increment_step() function inside _wrap_push_tape() function. + if self.is_profiler_enabled_for_native_training: + self._decrement_step() + if isinstance(tape, GradientTape): # unwrap tape before wrapping new tape to avoid recursive wrap tapes if self.tape: @@ -1319,11 +1336,13 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): self.set_mode(mode) + self.is_profiler_enabled_for_native_training = True + # When only profiler is enabled in the native tf2 training, # increasing the step number in the TRAIN and GLOBAL mode # and not writing the state. - if not self.is_debugger_enabled_for_native_training: - self._increment_step(write_state=self.is_debugger_enabled_for_native_training) + if self.is_profiler_enabled_for_native_training: + self._increment_step(write_state=False) self.profiler_config_parser.load_config() diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiler.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py index 56a76b907..b766df349 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiler.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -200,9 +200,9 @@ def train_step(images, labels): for epoch in range(n_epochs): for data, labels in dataset: labels = tf.one_hot(labels, depth=10) + hook.profiling_start_batch() if debugger: with hook.wrap_tape(tf.GradientTape()) as tape: - hook.profiling_start_batch() logits = train_step(data, labels) if python_profiler and start_step <= current_step < end_step: assert python_profiler._start_step == current_step @@ -214,7 +214,6 @@ def train_step(images, labels): hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) else: with tf.GradientTape() as tape: - hook.profiling_start_batch() logits = train_step(data, labels) if python_profiler and start_step <= current_step < end_step: assert python_profiler._start_step == current_step From 28a78825db36edecd9f451d8abf8a5532831b014 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 25 Jan 2021 14:27:55 -0800 Subject: [PATCH 45/97] update the comments --- smdebug/tensorflow/keras.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 81a55b863..feffa284d 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -122,10 +122,8 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag indicates to debugging for tensorflow2 native training - # self.is_debugger_enabled_for_native_training = False + # this flag indicates to profiler for tensorflow2 native training self.is_profiler_enabled_for_native_training = False - # self.step_incremented_in_profiling_begin = False if self.python_profiler: atexit.register(self.python_profiler.stop_profiling, StepPhase.END) From 015aca12dd6f79fc53bad56b04eb657f03f2d064 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 25 Jan 2021 15:00:00 -0800 Subject: [PATCH 46/97] update comments --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index feffa284d..3adbc29e5 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -122,7 +122,7 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # 
this flag indicates to profiler for tensorflow2 native training + # this flag indicates to profiling for tensorflow2 native training self.is_profiler_enabled_for_native_training = False if self.python_profiler: From a35fae3bab7bea81fa7abe753d9b91de1e2f4e44 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 25 Jan 2021 21:13:36 -0800 Subject: [PATCH 47/97] add docstring, update helper function names and improve the unit tests --- smdebug/tensorflow/keras.py | 86 ++++++++----- .../tensorflow2/test_native_tf2_profiler.py | 114 +++++++++++------- 2 files changed, 126 insertions(+), 74 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 3adbc29e5..5a21ad5cb 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -715,16 +715,25 @@ def _remove_fetches_and_callbacks(self, mode): self._fetches_added.clear() def _decrement_step(self): - # Called when both profiler and debugger are enabled in the native training loop - # to adjust the step number + """ + Called when both profiler and debugger are enabled in the native training loop + to adjust the step number + """ + self.step -= 1 self.mode_steps[self.mode] -= 1 - # Increment Global step number irrespective of what mode it is + # Decrease Global step number irrespective of what mode it is if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - def _start_phase_python_profiling(self, mode): + def _handle_start_python_profiling(self, mode): + """ + This function is called to handle python profiling at the start of a step. + :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: + """ + if self.python_profiler: self.python_profiler.stop_profiling( StepPhase.STEP_START, @@ -740,7 +749,12 @@ def _start_phase_python_profiling(self, mode): start_step=self.mode_steps[mode], ) - def _end_phase_python_profiling(self, mode): + def _handle_end_python_profiling(self, mode): + """ + This function is called to handle python profiling at the end of a step. + :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: + """ if self.python_profiler: self.python_profiler.stop_profiling( StepPhase.STEP_END, @@ -756,7 +770,12 @@ def _end_phase_python_profiling(self, mode): start_step=self.mode_steps[mode], ) - def _begin_detailed_profiling(self, mode=ModeKeys.TRAIN): + def _handle_start_detailed_profiling(self, mode=ModeKeys.TRAIN): + """ + This function is called to handle detailed profiling at the start of a step. + :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: + """ if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] @@ -784,7 +803,10 @@ def _begin_detailed_profiling(self, mode=ModeKeys.TRAIN): ) self.is_detailed_profiling = False - def _end_detailed_profiling(self): + def _handle_end_detailed_profiling(self): + """ + This function is called to handle detailed profiling at the end of a step. + """ if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: self.logger.info("Disabling profiler, reached end of training.") stop_tf_profiler( @@ -794,7 +816,12 @@ def _end_detailed_profiling(self): ) self.is_detailed_profiling = False - def _begin_dataloader_profiling(self, mode): + def _handle_start_dataloader_profiling(self, mode): + """ + This function is called to handle dataloader profiling at the start of a step. 
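+ Writes the TF dataloader start flag file when dataloader metrics should be collected for
+ this step, and the end flag file once dataloader profiling should stop.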
+ :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: + """ if self.profiler_config_parser.should_save_metrics( MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] ) and self.profiler_config_parser.write_tf_dataloader_flag( @@ -806,7 +833,10 @@ def _begin_dataloader_profiling(self, mode): ): self.is_dataloader_profiling = False - def _end_dataloader_profiling(self): + def _handle_end_dataloader_profiling(self): + """ + This function is called to handle dataloader profiling at the end of a step. + """ if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( TF_DATALOADER_END_FLAG_FILENAME ): @@ -863,11 +893,11 @@ def _on_any_mode_end(self, mode): start_step=self.mode_steps[mode], ) - self._end_dataloader_profiling() + self._handle_end_dataloader_profiling() def on_train_end(self, logs=None): self._on_any_mode_end(ModeKeys.TRAIN) - self._end_detailed_profiling() + self._handle_end_detailed_profiling() # throws error in keras if this fn is absent def on_test_end(self, logs=None): @@ -912,10 +942,8 @@ def _on_any_batch_begin(self, batch, mode, logs=None): self.step_incremented_in_on_train_begin = False self.profiler_config_parser.load_config() - - self._begin_dataloader_profiling(mode=mode) - - self._start_phase_python_profiling(mode=mode) + self._handle_start_dataloader_profiling(mode=mode) + self._handle_start_python_profiling(mode=mode) if self.prepared_collections is False: # sets prepared_collections to True here @@ -949,7 +977,7 @@ def _on_any_batch_begin(self, batch, mode, logs=None): def on_train_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.TRAIN, logs=logs) - self._begin_detailed_profiling() + self._handle_start_detailed_profiling() def on_test_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.EVAL, logs=logs) @@ -1042,7 +1070,7 @@ def _on_any_batch_end(self, batch, mode, logs=None): self._export_model() self._exported_model[self.mode] = True - self._end_phase_python_profiling(mode=mode) + self._handle_end_python_profiling(mode=mode) def on_train_batch_end(self, batch, logs=None): self._on_any_batch_end(batch, ModeKeys.TRAIN, logs=logs) @@ -1326,6 +1354,8 @@ def record_tensor_value(self, tensor_name, tensor_value): def profiling_start_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the start of train batch when native tf2 training is used. + :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: """ self.start = time.time() @@ -1333,26 +1363,22 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): return self.set_mode(mode) - self.is_profiler_enabled_for_native_training = True # When only profiler is enabled in the native tf2 training, # increasing the step number in the TRAIN and GLOBAL mode # and not writing the state. - if self.is_profiler_enabled_for_native_training: - self._increment_step(write_state=False) - + self._increment_step(write_state=False) self.profiler_config_parser.load_config() - - self._begin_dataloader_profiling(mode=mode) - - self._start_phase_python_profiling(mode=mode) - - self._begin_detailed_profiling(mode=mode) + self._handle_start_dataloader_profiling(mode=mode) + self._handle_start_python_profiling(mode=mode) + self._handle_start_detailed_profiling(mode=mode) def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the end of train batch when native tf2 training is used. 
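+ Records the duration of the step as a trace event and rotates python profiling from the
+ in-step phase to the post-step phase.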
+ :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: """ if self._is_not_supported(): return @@ -1366,7 +1392,7 @@ def profiling_end_batch(self, mode=ModeKeys.TRAIN): pid=os.getpid(), step_num=str(self.mode_steps[mode]), ) - self._end_phase_python_profiling(mode=mode) + self._handle_end_python_profiling(mode=mode) def profiling_end(self): """ @@ -1374,5 +1400,5 @@ def profiling_end(self): """ # Unwrap the tape before closing and close the python profiling self.close() - self._end_dataloader_profiling() - self._end_detailed_profiling() + self._handle_end_dataloader_profiling() + self._handle_end_detailed_profiling() diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiler.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py index b766df349..081f70bfc 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiler.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -145,8 +145,8 @@ def generate_profiler_config_parser_all_params( return profiler_config_parser -def set_up_profiling(profilerconfig): - profiler_config_parser = profilerconfig +def set_up_profiling(profiler_config): + profiler_config_parser = profiler_config python_profiler = None if profiler_config_parser.profiling_enabled: config = profiler_config_parser.config @@ -199,8 +199,8 @@ def train_step(images, labels): n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) hook.profiling_start_batch() + labels = tf.one_hot(labels, depth=10) if debugger: with hook.wrap_tape(tf.GradientTape()) as tape: logits = train_step(data, labels) @@ -231,7 +231,26 @@ def train_step(images, labels): assert python_profiler._start_phase == StepPhase.STEP_END -def verify_num_trace_events(profilerconfig): +def initiate_python_profiling(profiler_config): + assert profiler_config.profiling_enabled + profiler_config_parser, python_profiler = set_up_profiling(profiler_config) + config = profiler_config_parser.config + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + return python_profiler, start_step, end_step + + +def train_loop(out_dir, debugger=False, python_profiler=None, start_step=None, end_step=None): + hook = Hook(out_dir=out_dir, save_all=True) + if python_profiler: + hook.python_profiler = python_profiler + helper_native_tf2_gradtape( + hook=hook, debugger=debugger, start_step=start_step, end_step=end_step + ) + + +def verify_num_trace_events(profiler_config): """ This verifies the number of events when detailed profiling is enabled. """ @@ -240,7 +259,7 @@ def verify_num_trace_events(profilerconfig): # get tensorboard timeline files files = [] - for path in Path(os.path.join(profilerconfig.config.local_path + "/framework")).rglob( + for path in Path(os.path.join(profiler_config.config.local_path + "/framework")).rglob( f"*{TENSORBOARDTIMELINE_SUFFIX}" ): files.append(path) @@ -260,11 +279,6 @@ def verify_num_trace_events(profilerconfig): assert num_trace_events >= 230 -def train_loop(out_dir, debugger=False): - hook = Hook(out_dir=out_dir, save_all=True) - helper_native_tf2_gradtape(hook=hook, debugger=debugger) - - def verify_tensor_names(out_dir): """ This verifies the tensor names when debugger is enabled. 
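The body of verify_tensor_names is not part of this hunk. A minimal sketch of the kind of check it is expected to perform, assuming it reads back the trial written by the hook.save_tensor calls in the training loop (the helper name and the exact collections asserted here are illustrative and may differ from the real implementation):

    import smdebug.tensorflow as smd
    from smdebug.core.collection import CollectionKeys

    def verify_tensor_names_sketch(out_dir):
        # read back the debugger output produced during training
        trial = smd.create_trial(out_dir)
        assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"]
        assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"]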
@@ -313,19 +327,11 @@ def verify_timeline_file(out_dir): assert events_dict -def verify_python_profiling(profiler_name, out_dir, profilerconfig, debugger=False): +def verify_python_profiling(profiler_name, out_dir, num_steps): """ This executes a TF2 native training script with profiler or both profiler and debugger, enables python profiling by step, and verifies the python profiling's steps and expected output files. """ - assert profilerconfig.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(profilerconfig) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps if profiler_name == CPROFILE_NAME: allowed_files = [CPROFILE_STATS_FILENAME] @@ -334,15 +340,6 @@ def verify_python_profiling(profiler_name, out_dir, profilerconfig, debugger=Fal allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) - hook = Hook(out_dir=out_dir, save_all=True) - hook.python_profiler = python_profiler - helper_native_tf2_gradtape( - hook=hook, - python_profiler=python_profiler, - start_step=start_step, - end_step=end_step, - debugger=debugger, - ) assert os.path.isdir(python_stats_dir) @@ -395,19 +392,33 @@ def test_native_tf2_profiling_debugger( profiler_config_parser = generate_profiler_config_parser( "PythonProfiling", profiler_config_path, (5, 2, CPROFILE_NAME, None) ) - verify_python_profiling(CPROFILE_NAME, out_dir, profiler_config_parser) - verify_timeline_file(out_dir) if enable_python_profiling == PYINSTRUMENT_NAME: profiler_config_parser = generate_profiler_config_parser( "PythonProfiling", profiler_config_path, (10, 3, PYINSTRUMENT_NAME, None) ) - verify_python_profiling(PYINSTRUMENT_NAME, out_dir, profiler_config_parser) - verify_timeline_file(out_dir) + python_profiler, start_step, end_step = initiate_python_profiling( + profiler_config_parser + ) + train_loop( + out_dir, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) + verify_python_profiling( + enable_python_profiling, out_dir, num_steps=end_step - start_step + ) + verify_timeline_file(out_dir) elif enable_detailed_profiling and enable_python_profiling: profiler_config_parser = generate_profiler_config_parser_all_params( profiler_config_path, (4, 2, enable_python_profiling, None), (8, 1, None, None) ) - verify_python_profiling(enable_python_profiling, out_dir, profiler_config_parser) + python_profiler, start_step, end_step = initiate_python_profiling( + profiler_config_parser + ) + train_loop( + out_dir, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) + verify_python_profiling( + enable_python_profiling, out_dir, num_steps=end_step - start_step + ) verify_num_trace_events(profiler_config_parser) verify_timeline_file(out_dir) else: @@ -426,26 +437,41 @@ def test_native_tf2_profiling_debugger( profiler_config_parser = generate_profiler_config_parser( "PythonProfiling", profiler_config_path, (5, 2, CPROFILE_NAME, None) ) - verify_python_profiling( - CPROFILE_NAME, out_dir, profiler_config_parser, debugger=True - ) - verify_timeline_file(out_dir) - verify_tensor_names(out_dir) if enable_python_profiling == PYINSTRUMENT_NAME: profiler_config_parser = generate_profiler_config_parser( "PythonProfiling", profiler_config_path, (10, 3, PYINSTRUMENT_NAME, None) ) - verify_python_profiling( - PYINSTRUMENT_NAME, out_dir, 
profiler_config_parser, debugger=True - ) - verify_timeline_file(out_dir) - verify_tensor_names(out_dir) + python_profiler, start_step, end_step = initiate_python_profiling( + profiler_config_parser + ) + train_loop( + out_dir, + debugger=True, + python_profiler=python_profiler, + start_step=start_step, + end_step=end_step, + ) + verify_python_profiling( + enable_python_profiling, out_dir, num_steps=end_step - start_step + ) + verify_timeline_file(out_dir) + verify_tensor_names(out_dir) elif enable_detailed_profiling and enable_python_profiling: profiler_config_parser = generate_profiler_config_parser_all_params( profiler_config_path, (4, 2, enable_python_profiling, None), (8, 1, None, None) ) + python_profiler, start_step, end_step = initiate_python_profiling( + profiler_config_parser + ) + train_loop( + out_dir, + debugger=True, + python_profiler=python_profiler, + start_step=start_step, + end_step=end_step, + ) verify_python_profiling( - enable_python_profiling, out_dir, profiler_config_parser, debugger=True + enable_python_profiling, out_dir, num_steps=end_step - start_step ) verify_num_trace_events(profiler_config_parser) verify_timeline_file(out_dir) From d72cd1e02e2f80d8ec7f7899a6045b9b97adc302 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 26 Jan 2021 16:54:06 -0800 Subject: [PATCH 48/97] update docstring and function name --- smdebug/tensorflow/keras.py | 56 +++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 5a21ad5cb..d40b58d98 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -723,15 +723,14 @@ def _decrement_step(self): self.step -= 1 self.mode_steps[self.mode] -= 1 - # Decrease Global step number irrespective of what mode it is + # Decrement Global step number irrespective of what mode it is if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - def _handle_start_python_profiling(self, mode): + def _handle_start_step_python_profiling(self, mode): """ This function is called to handle python profiling at the start of a step. :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ if self.python_profiler: @@ -749,11 +748,10 @@ def _handle_start_python_profiling(self, mode): start_step=self.mode_steps[mode], ) - def _handle_end_python_profiling(self, mode): + def _handle_end_step_python_profiling(self, mode): """ This function is called to handle python profiling at the end of a step. :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ if self.python_profiler: self.python_profiler.stop_profiling( @@ -770,11 +768,10 @@ def _handle_end_python_profiling(self, mode): start_step=self.mode_steps[mode], ) - def _handle_start_detailed_profiling(self, mode=ModeKeys.TRAIN): + def _handle_detailed_profiling(self, mode=ModeKeys.TRAIN): """ - This function is called to handle detailed profiling at the start of a step. + This function is called to handle detailed profiling at the start of a mode. :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( @@ -803,9 +800,9 @@ def _handle_start_detailed_profiling(self, mode=ModeKeys.TRAIN): ) self.is_detailed_profiling = False - def _handle_end_detailed_profiling(self): + def _stop_detailed_profiling(self): """ - This function is called to handle detailed profiling at the end of a step. 
+ This function is called to stop detailed profiling at the end of a mode. """ if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: self.logger.info("Disabling profiler, reached end of training.") @@ -816,11 +813,10 @@ def _handle_end_detailed_profiling(self): ) self.is_detailed_profiling = False - def _handle_start_dataloader_profiling(self, mode): + def _handle_dataloader_profiling(self, mode): """ - This function is called to handle dataloader profiling at the start of a step. + This function is called to handle dataloader profiling at the start of a mode. :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ if self.profiler_config_parser.should_save_metrics( MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] @@ -833,9 +829,9 @@ def _handle_start_dataloader_profiling(self, mode): ): self.is_dataloader_profiling = False - def _handle_end_dataloader_profiling(self): + def _stop_dataloader_profiling(self): """ - This function is called to handle dataloader profiling at the end of a step. + This function is called to stop dataloader profiling at the end of a mode. """ if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( TF_DATALOADER_END_FLAG_FILENAME @@ -893,11 +889,11 @@ def _on_any_mode_end(self, mode): start_step=self.mode_steps[mode], ) - self._handle_end_dataloader_profiling() + self._stop_dataloader_profiling() def on_train_end(self, logs=None): self._on_any_mode_end(ModeKeys.TRAIN) - self._handle_end_detailed_profiling() + self._stop_detailed_profiling() # throws error in keras if this fn is absent def on_test_end(self, logs=None): @@ -942,8 +938,8 @@ def _on_any_batch_begin(self, batch, mode, logs=None): self.step_incremented_in_on_train_begin = False self.profiler_config_parser.load_config() - self._handle_start_dataloader_profiling(mode=mode) - self._handle_start_python_profiling(mode=mode) + self._handle_dataloader_profiling(mode=mode) + self._handle_start_step_python_profiling(mode=mode) if self.prepared_collections is False: # sets prepared_collections to True here @@ -977,7 +973,7 @@ def _on_any_batch_begin(self, batch, mode, logs=None): def on_train_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.TRAIN, logs=logs) - self._handle_start_detailed_profiling() + self._handle_detailed_profiling() def on_test_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.EVAL, logs=logs) @@ -1070,7 +1066,7 @@ def _on_any_batch_end(self, batch, mode, logs=None): self._export_model() self._exported_model[self.mode] = True - self._handle_end_python_profiling(mode=mode) + self._handle_end_step_python_profiling(mode=mode) def on_train_batch_end(self, batch, logs=None): self._on_any_batch_end(batch, ModeKeys.TRAIN, logs=logs) @@ -1318,9 +1314,9 @@ def wrap_tape(self, tape): self.set_mode(ModeKeys.TRAIN) - # When both profiler and debugger are enabled in the native training, step number is firstly increased by 1 in + # When both profiler and debugger are enabled in the native training, step number is increased by 1 in # the profiling_start_batch() function, and should be decreased by 1 here in order to keep the step number - # correct when calling _increment_step() function inside _wrap_push_tape() function. + # consistent since _increment_step() will be called in wrap_push_tape(). 
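+ # Net effect per training iteration when both are enabled:
+ #   profiling_start_batch()   -> step += 1
+ #   wrap_tape()               -> step -= 1 (here)
+ #   wrapped push tape         -> step += 1
+ # so the step number still advances exactly once per iteration.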
if self.is_profiler_enabled_for_native_training: self._decrement_step() @@ -1355,7 +1351,6 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the start of train batch when native tf2 training is used. :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ self.start = time.time() @@ -1370,15 +1365,14 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): # and not writing the state. self._increment_step(write_state=False) self.profiler_config_parser.load_config() - self._handle_start_dataloader_profiling(mode=mode) - self._handle_start_python_profiling(mode=mode) - self._handle_start_detailed_profiling(mode=mode) + self._handle_dataloader_profiling(mode=mode) + self._handle_start_step_python_profiling(mode=mode) + self._handle_detailed_profiling(mode=mode) def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the end of train batch when native tf2 training is used. :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ if self._is_not_supported(): return @@ -1392,7 +1386,7 @@ def profiling_end_batch(self, mode=ModeKeys.TRAIN): pid=os.getpid(), step_num=str(self.mode_steps[mode]), ) - self._handle_end_python_profiling(mode=mode) + self._handle_end_step_python_profiling(mode=mode) def profiling_end(self): """ @@ -1400,5 +1394,5 @@ def profiling_end(self): """ # Unwrap the tape before closing and close the python profiling self.close() - self._handle_end_dataloader_profiling() - self._handle_end_detailed_profiling() + self._stop_dataloader_profiling() + self._stop_detailed_profiling() From 4a7b74b6902112fd6c6639a339d912729fa9d0e0 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:32:23 -0800 Subject: [PATCH 49/97] add changes for enabling profiling for native tf2 training --- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py new file mode 100644 index 000000000..e69de29bb From 440f0b1850d12038c4e5a9857c98eb26545cdeb8 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:42:55 -0800 Subject: [PATCH 50/97] add changes for enabling profiling in the native tf2 training --- smdebug/tensorflow/keras.py | 169 +++++++++++++++++- .../tensorflow2/test_native_tf2_profiling.py | 144 +++++++++++++++ 2 files changed, 305 insertions(+), 8 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 49a87419e..1c0ee5805 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -121,6 +121,9 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False + # this flag is used to handle step number increment in the tensorflow native training + # it indicated to profiling for tensorflow2 native training + self.profiling_native_training = False if python_profiler: atexit.register(python_profiler.stop_profiling, StepPhase.END) @@ -1099,7 +1102,9 @@ def unwrap(func): def close(self): self._cleanup() + print('\nStep Number in the close function: ', self.step) if python_profiler: + print('python profiling for end of last train step to end of training') python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ 
-1143,7 +1148,9 @@ def run(*args, **kwargs): self._prepare_collections() self.prepared_collections = True - self._increment_step() + if not self.profiling_native_training: + self._increment_step() + print('\nStep number in the push tape: ', self.step) if self._get_collections_to_save_for_step(): self._initialize_writers() @@ -1233,6 +1240,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step + print('\nStep number in the pop tape: ', self.step) return run @@ -1259,13 +1267,13 @@ def wrap_tape(self, tape): :return: Wrapped tape of same type as passed. This tape should be used for training """ - # Disable python profiling, because now we are starting wrap tape. - if python_profiler: - python_profiler.stop_profiling( - StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(self.mode), - end_step=0, - ) + # # Disable python profiling, because now we are starting wrap tape. + # if python_profiler: + # python_profiler.stop_profiling( + # StepPhase.STEP_START, + # end_mode=mode_keys_to_python_profile_mode(self.mode), + # end_step=0, + # ) from tensorflow.python.eager.backprop import GradientTape @@ -1295,3 +1303,148 @@ def record_tensor_value(self, tensor_name, tensor_value): if self._is_collection_being_saved_for_step(CollectionKeys.METRICS): self._initialize_writers(only_initialize_if_missing=True) self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) + + def start_profiling_start_train_batch(self): + print('Start profiling train batch') + + self.start = time.time() + # santiy check + if self._is_not_supported(): + return + # set mode to TRAIN + self.set_mode(ModeKeys.TRAIN) + + self.profiling_native_training = True + if self.profiling_native_training: + self._increment_step() + # if self.step_incremented_in_on_train_begin is False: + # self._increment_step() + # else: + # self.step_incremented_in_on_train_begin = False + + print('\nStep Number in start train batch: ', self.mode_steps[ModeKeys.TRAIN]) + + # load the profiler config + self.profiler_config_parser.load_config() + + + # Dataloader + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DATALOADER_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ) and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_START_FLAG_FILENAME + ): + print('Dataloader profiling') + self.is_dataloader_profiling = True + elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_END_FLAG_FILENAME + ): + self.is_dataloader_profiling = False + + # python profiling + if python_profiler: + print('Stop python profiling in start train batch') + python_profiler.stop_profiling( + StepPhase.STEP_START, + end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + end_step=self.mode_steps[ModeKeys.TRAIN], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ): + print('Start python profiling in start train batch') + python_profiler.start_profiling( + StepPhase.STEP_START, + start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + start_step=self.mode_steps[ModeKeys.TRAIN], + ) + + # detail profiling + if is_profiler_supported_for_tf_version(): + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DETAILED_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ): + if not self.is_detailed_profiling: + self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( + self.profiler_config_parser.config.local_path, + "tensorflow", + 
self.mode_steps[ModeKeys.TRAIN], + ) + self.logger.info( + f"Enabling TF profiler on step: = {self.mode_steps[ModeKeys.TRAIN]}" + ) + if not self.warm_up_completed: + # warming up profiler before it will be profiling. + self.tf_profiler.warmup() + self.warm_up_completed = True + self.tf_profiler.start(self._log_dir) + self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS + self.is_detailed_profiling = True + elif self.is_detailed_profiling: + self.logger.info( + f"Disabling TF profiler on step: ={self.mode_steps[ModeKeys.TRAIN]}" + ) + stop_tf_profiler( + tf_profiler=self.tf_profiler, + log_dir=self._log_dir, + start_time_us=self.tf_profiler_start_time_in_micros, + ) + self.is_detailed_profiling = False + + def start_profiling_end_train_batch(self): + print('End profiling train batch') + print('\nStep Number in end train batch: ', self.mode_steps[ModeKeys.TRAIN]) + # sanity check + if self._is_not_supported(): + return + + self.record_trace_events( + training_phase="Step:" + str(ModeKeys.TRAIN), + op_name="Step:" + str(ModeKeys.TRAIN), + phase="X", + timestamp=self.start, # this is start time for step + duration=time.time() - self.start, + pid=os.getpid(), + step_num=str(self.mode_steps[ModeKeys.TRAIN]), + ) + + if python_profiler: + print('Stop python profiling in end train batch') + python_profiler.stop_profiling( + StepPhase.STEP_END, + end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + end_step=self.mode_steps[ModeKeys.TRAIN], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ): + print('Start python profiling in end train batch') + python_profiler.start_profiling( + StepPhase.STEP_END, + start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + start_step=self.mode_steps[ModeKeys.TRAIN], + ) + + def stop_profiling_end_of_training(self): + print('\nEnd of training!') + print('\nStep Number at the end of training: ', self.step) + + # Alternatively, use self.close to close the python profiling + self.close() + + if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_END_FLAG_FILENAME + ): + print('Stop Dataloader profiling') + self.is_dataloader_profiling = False + + if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: + self.logger.info("Disabling profiler, reached end of training.") + stop_tf_profiler( + tf_profiler=self.tf_profiler, + log_dir=self._log_dir, + start_time_us=self.tf_profiler_start_time_in_micros, + ) + self.is_detailed_profiling = False + + self.profiling_native_training = False diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index e69de29bb..16b9e121a 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -0,0 +1,144 @@ +# Standard Library +import os +from pathlib import Path + +# Third Party +import tensorflow as tf +import pytest +# from tests.tensorflow2.test_keras import helper_keras_fit + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.profiler.profiler_config_parser import ProfilerConfigParser +from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX +from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents +from smdebug.tensorflow import KerasHook as Hook + +@pytest.fixture() +def 
tf2_profiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +def create_hook(trial_dir): + hook = smd.KerasHook(trial_dir, save_all=True) + return hook + + +def create_model(): + model = tf.keras.models.Sequential( + [ + # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 + tf.keras.layers.Flatten(input_shape=(28, 28, 1)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) + return model + +def helper_native_tf2(): + + +def test_gradtape_tf_function(out_dir): + def get_grads(images, labels): + # with tf.GradientTape() as tape: + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(64) + model = create_model() + hook = create_hook(out_dir) + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + dataset_labels = labels + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with hook.wrap_tape(tf.GradientTape()) as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + model.save(out_dir, save_format="tf") + + + trial = smd.create_trial(out_dir) + assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ + "weights/dense/kernel:0", + "weights/dense_1/kernel:0", + ] + assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ + "weights/dense/bias:0", + "weights/dense_1/bias:0", + ] + assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] + + +def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. 
+ """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) + hook.close() + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 \ No newline at end of file From eae6a87a00bab6aa769528c09ee64aa726b608e4 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 11 Jan 2021 11:08:36 -0800 Subject: [PATCH 51/97] add tests --- smdebug/tensorflow/keras.py | 31 +- ...filer_cprofiler_config_parser_by_step.json | 7 + ...er_pyinstrument_config_parser_by_step.json | 7 + .../tensorflow2/test_native_tf2_profiling.py | 349 +++++++++++++++++- 4 files changed, 358 insertions(+), 36 deletions(-) create mode 100644 tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json create mode 100644 tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 1c0ee5805..a87d3a903 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -121,8 +121,8 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag is used to handle step number increment in the tensorflow native training - # it indicated to profiling for tensorflow2 native training + # this flag is used to handle step number increment in the tensorflow native training when profiler is on + # it indicates to profiling for tensorflow2 native training self.profiling_native_training = False if python_profiler: @@ -1044,7 +1044,7 @@ def wrap_optimizer(self, optimizer): if isinstance(optimizer, tf.train.Optimizer): optimizer = self._wrap_apply_gradients(optimizer) elif isinstance(optimizer, tf.keras.optimizers.Optimizer) or is_keras_optimizer(optimizer): - # either subclasse of optimizerV2 class in tf.keras + # either subclass of optimizerV2 class in tf.keras # or keras.optimizers.Optimizer original_get_grads = optimizer.__class__.get_gradients @@ -1305,30 +1305,25 @@ def record_tensor_value(self, tensor_name, tensor_value): self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) def start_profiling_start_train_batch(self): + """ + Enabling profiler at the start of train batch when native tf2 training is used. 
+ """ print('Start profiling train batch') self.start = time.time() - # santiy check + if self._is_not_supported(): return - # set mode to TRAIN self.set_mode(ModeKeys.TRAIN) self.profiling_native_training = True if self.profiling_native_training: self._increment_step() - # if self.step_incremented_in_on_train_begin is False: - # self._increment_step() - # else: - # self.step_incremented_in_on_train_begin = False print('\nStep Number in start train batch: ', self.mode_steps[ModeKeys.TRAIN]) - # load the profiler config self.profiler_config_parser.load_config() - - # Dataloader if self.profiler_config_parser.should_save_metrics( MetricsCategory.DATALOADER_PROFILING, self.mode_steps[ModeKeys.TRAIN] ) and self.profiler_config_parser.write_tf_dataloader_flag( @@ -1341,7 +1336,6 @@ def start_profiling_start_train_batch(self): ): self.is_dataloader_profiling = False - # python profiling if python_profiler: print('Stop python profiling in start train batch') python_profiler.stop_profiling( @@ -1359,7 +1353,6 @@ def start_profiling_start_train_batch(self): start_step=self.mode_steps[ModeKeys.TRAIN], ) - # detail profiling if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( MetricsCategory.DETAILED_PROFILING, self.mode_steps[ModeKeys.TRAIN] @@ -1392,9 +1385,12 @@ def start_profiling_start_train_batch(self): self.is_detailed_profiling = False def start_profiling_end_train_batch(self): + """ + Enabling profiler at the end of train batch when native tf2 training is used. + """ print('End profiling train batch') print('\nStep Number in end train batch: ', self.mode_steps[ModeKeys.TRAIN]) - # sanity check + if self._is_not_supported(): return @@ -1426,10 +1422,13 @@ def start_profiling_end_train_batch(self): ) def stop_profiling_end_of_training(self): + """ + Stop profiler at the end of training when native tf2 training is used. 
+ """ print('\nEnd of training!') print('\nStep Number at the end of training: ', self.step) - # Alternatively, use self.close to close the python profiling + # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json new file mode 100644 index 000000000..2ab039217 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -0,0 +1,7 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2}" + } +} \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json new file mode 100644 index 000000000..325224801 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -0,0 +1,7 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"python_profiler\": \"Pyinstrument\"}" + } +} \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 16b9e121a..3e1f08b2b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -1,19 +1,40 @@ # Standard Library import os +import json +import time +from datetime import datetime from pathlib import Path +import pstats # Third Party import tensorflow as tf import pytest -# from tests.tensorflow2.test_keras import helper_keras_fit # First Party import smdebug.tensorflow as smd from smdebug.core.collection import CollectionKeys +from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter from smdebug.profiler.profiler_config_parser import ProfilerConfigParser from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents from smdebug.tensorflow import KerasHook as Hook +from smdebug.profiler.profiler_constants import ( + CONVERT_TO_MICROSECS, + DEFAULT_PREFIX, + TRACE_DIRECTORY_FORMAT, + CPROFILE_NAME, + CPROFILE_STATS_FILENAME, + PYINSTRUMENT_HTML_FILENAME, + PYINSTRUMENT_JSON_FILENAME, + PYINSTRUMENT_NAME, +) +from smdebug.profiler.python_profiler import PythonProfiler +from smdebug.profiler.python_profiler import ( + PyinstrumentPythonProfiler, + cProfilePythonProfiler, + cProfileTimer, +) +from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase @pytest.fixture() def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @@ -22,6 +43,20 @@ def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): return ProfilerConfigParser() +@pytest.fixture() +def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_python_pyinstrument_config_parser_by_step(config_folder, 
monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + @pytest.fixture() def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") @@ -29,9 +64,24 @@ def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): return ProfilerConfigParser() -def create_hook(trial_dir): - hook = smd.KerasHook(trial_dir, save_all=True) - return hook +@pytest.fixture +def test_framework(): + return "test-framework" + + +@pytest.fixture() +def cprofile_python_profiler(out_dir, test_framework): + return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) + + +@pytest.fixture() +def pyinstrument_python_profiler(out_dir, test_framework): + return PyinstrumentPythonProfiler(out_dir, test_framework) + + +@pytest.fixture() +def framework_dir(out_dir, test_framework): + return "{0}/framework/{1}".format(out_dir, test_framework) def create_model(): @@ -46,12 +96,10 @@ def create_model(): ) return model -def helper_native_tf2(): +def helper_native_tf2_profiler_debugger(trial_dir, hook): -def test_gradtape_tf_function(out_dir): def get_grads(images, labels): - # with tf.GradientTape() as tape: return model(images, training=True) @tf.function @@ -63,32 +111,29 @@ def train_step(images, labels): dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) ) - dataset = dataset.shuffle(1000).batch(64) + dataset = dataset.shuffle(1000).batch(128) model = create_model() - hook = create_hook(out_dir) opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: - dataset_labels = labels labels = tf.one_hot(labels, depth=10) hook.start_profiling_start_train_batch() with hook.wrap_tape(tf.GradientTape()) as tape: logits = train_step(data, labels) grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) - # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(out_dir, save_format="tf") - + model.save(trial_dir, save_format="tf") - trial = smd.create_trial(out_dir) + trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ "weights/dense/kernel:0", @@ -109,7 +154,40 @@ def train_step(images, labels): assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] -def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): +def helper_native_tf2_profiler(trial_dir, hook): + + def get_grads(images, labels): + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., 
tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(128) + model = create_model() + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with tf.GradientTape() as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): """ This test executes a TF2 native training script, enables detailed TF profiling by step, and verifies the number of events. @@ -117,8 +195,7 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config assert tf2_profiler_config_parser_by_step.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) - hook.close() + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -141,4 +218,236 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config # The number of events is varying by a small number on # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 \ No newline at end of file + assert num_trace_events >= 230 + + +def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. 
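+ The debugger path is exercised as well: the helper wraps the GradientTape and saves
+ input/output tensors alongside the profiler output.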
+ """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + + +# @pytest.mark.parametrize("use_pyinstrument", [False, True]) +# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) +def test_native_python_profiling_cprofiler( + out_dir, tf2_python_cprofiler_config_parser_by_step +): + assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled + + config = tf2_python_cprofiler_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. 
+ assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_native_python_profiling_pyinstrument( + out_dir, tf2_python_pyinstrument_config_parser_by_step +): + assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled + + config = tf2_python_pyinstrument_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.profiler_name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = PYINSTRUMENT_NAME + allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_create_timeline_file(simple_profiler_config_parser, out_dir): + """ + This test is meant to test successful creation of the timeline file according to file path specification. + $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ + {$ENV_NODE_ID_4digits0padded}_pythontimeline.json + It reads backs the file contents to make sure it is in valid JSON format. 
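+ The timestamp encoded in the file name is also expected to match its parent folder name.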
+ """ + assert simple_profiler_config_parser.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict \ No newline at end of file From cc7699530ad635c958923961db884cff09868fd6 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 12:05:19 -0800 Subject: [PATCH 52/97] add python profiler as attr for kerashook --- smdebug/tensorflow/keras.py | 57 ++++++++++++------- ...er_pyinstrument_config_parser_by_step.json | 2 +- .../tensorflow2/test_native_tf2_profiling.py | 6 +- 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index a87d3a903..009efe230 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,6 +61,7 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) +print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -125,7 +126,11 @@ def __init__( # it indicates to profiling for tensorflow2 native training self.profiling_native_training = False - if python_profiler: + self.python_profiler = python_profiler + print('\nObject inside: ', self.python_profiler) + + if self.python_profiler: + print('exit stop profiling object inside: ', self.python_profiler, self.step) atexit.register(python_profiler.stop_profiling, StepPhase.END) def _is_not_supported(self): @@ -753,13 +758,13 @@ def on_test_begin(self, logs=None): self._on_any_mode_begin(ModeKeys.EVAL) def _on_any_mode_end(self, mode): - if python_profiler: - python_profiler.stop_profiling( + if self.python_profiler: + self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(mode), end_step=self.mode_steps[mode], ) - python_profiler.start_profiling( + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(mode), start_step=self.mode_steps[mode], @@ -837,8 +842,8 @@ def _on_any_batch_begin(self, batch, mode, logs=None): ): self.is_dataloader_profiling = False - if python_profiler: - python_profiler.stop_profiling( + if self.python_profiler: + self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), end_step=self.mode_steps[mode], @@ -846,7 +851,7 @@ def _on_any_batch_begin(self, batch, mode, logs=None): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - python_profiler.start_profiling( + self.python_profiler.start_profiling( StepPhase.STEP_START, start_mode=mode_keys_to_python_profile_mode(mode), start_step=self.mode_steps[mode], @@ -1007,8 +1012,8 @@ def _on_any_batch_end(self, batch, mode, logs=None): self._export_model() self._exported_model[self.mode] = True - if python_profiler: - python_profiler.stop_profiling( + if self.python_profiler: 
+ self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(mode), end_step=self.mode_steps[mode], @@ -1016,7 +1021,7 @@ def _on_any_batch_end(self, batch, mode, logs=None): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - python_profiler.start_profiling( + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(mode), start_step=self.mode_steps[mode], @@ -1103,14 +1108,16 @@ def unwrap(func): def close(self): self._cleanup() print('\nStep Number in the close function: ', self.step) - if python_profiler: + if self.python_profiler: print('python profiling for end of last train step to end of training') - python_profiler.start_profiling( + print('close start profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) + def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1308,7 +1315,15 @@ def start_profiling_start_train_batch(self): """ Enabling profiler at the start of train batch when native tf2 training is used. """ - print('Start profiling train batch') + + # print('Start profiling train batch') + # a = PythonProfiler.get_python_profiler(profiler_config_parser.config, 'tensorflow') + # print('\nProfiler enabled: ', a) + # + # print('\nname: ', self.profiler_config_parser.config.python_profiling_config.name) + # print('\npython profiling: ', self.python_profiler) + # print('\nstart_step: ', self.profiler_config_parser.config.python_profiling_config.start_step) + # print('\nnum_steps: ', self.profiler_config_parser.config.python_profiling_config.num_steps) self.start = time.time() @@ -1336,9 +1351,10 @@ def start_profiling_start_train_batch(self): ): self.is_dataloader_profiling = False - if python_profiler: + if self.python_profiler: print('Stop python profiling in start train batch') - python_profiler.stop_profiling( + print('start train batch stop profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), end_step=self.mode_steps[ModeKeys.TRAIN], @@ -1347,7 +1363,8 @@ def start_profiling_start_train_batch(self): MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] ): print('Start python profiling in start train batch') - python_profiler.start_profiling( + print('start train batch start profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.start_profiling( StepPhase.STEP_START, start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), start_step=self.mode_steps[ModeKeys.TRAIN], @@ -1404,9 +1421,10 @@ def start_profiling_end_train_batch(self): step_num=str(self.mode_steps[ModeKeys.TRAIN]), ) - if python_profiler: + if self.python_profiler: print('Stop python profiling in end train batch') - python_profiler.stop_profiling( + print('end train batch stop profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), end_step=self.mode_steps[ModeKeys.TRAIN], @@ -1415,7 +1433,8 @@ def start_profiling_end_train_batch(self): MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] ): print('Start python profiling in end train batch') - python_profiler.start_profiling( + print('end train 
batch start profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), start_step=self.mode_steps[ModeKeys.TRAIN], diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 325224801..02bc8c0d3 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"python_profiler\": \"Pyinstrument\"}" + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" } } \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 3e1f08b2b..9ba8dd60b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -173,6 +173,8 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + + # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -338,6 +340,7 @@ def test_native_python_profiling_cprofiler( print('\nnum_steps: ', config.python_profiling_config.num_steps) python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -347,6 +350,7 @@ def test_native_python_profiling_cprofiler( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. 
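# A brief, hedged aside (illustrative sketch only, not part of this patch): the
# cProfile assertions in these tests only check that pstats can parse each stats
# dump. The same dumps can be inspected directly with the standard library; the
# base directory below is hypothetical and simply mirrors the layout the tests
# assert (out_dir/framework/tensorflow/<profiler_name>/<node_id>/<stats_dir>).

import glob
import os
import pstats

python_stats_dir = "/tmp/test/framework/tensorflow/cprofile"  # hypothetical location
for stats_path in glob.glob(os.path.join(python_stats_dir, "*", "*", "python_stats")):
    stats = pstats.Stats(stats_path)  # parse the dump written for one step phase
    stats.sort_stats("cumulative")    # order entries by cumulative time per call
    stats.print_stats(5)              # print the five most expensive entries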
@@ -384,7 +388,7 @@ def test_native_python_profiling_pyinstrument( print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps From 80ff23a158a3a790f4afe2996059fdb8ec5834c2 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 16:01:59 -0800 Subject: [PATCH 53/97] add tests --- smdebug/tensorflow/keras.py | 2 +- ...filer_cprofiler_config_parser_by_step.json | 2 +- ...er_pyinstrument_config_parser_by_step.json | 2 +- .../tensorflow2/test_native_tf2_profiling.py | 34 ++++++++++++------- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 009efe230..d31017179 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -131,7 +131,7 @@ def __init__( if self.python_profiler: print('exit stop profiling object inside: ', self.python_profiler, self.step) - atexit.register(python_profiler.stop_profiling, StepPhase.END) + atexit.register(self.python_profiler.stop_profiling, StepPhase.END) def _is_not_supported(self): if self.distribution_strategy is None: diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index 2ab039217..d568b471c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2}" + "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } } \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 02bc8c0d3..c1c45594c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" + "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 9ba8dd60b..9a7416be3 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -5,6 +5,7 @@ from datetime import datetime from pathlib import Path import pstats +import atexit # Third Party import tensorflow as tf @@ -84,6 +85,18 @@ def framework_dir(out_dir, test_framework): return "{0}/framework/{1}".format(out_dir, test_framework) +def set_up_profiling(profilerconfig): + profiler_config_parser = profilerconfig + python_profiler = None + if profiler_config_parser.profiling_enabled: + config = 
profiler_config_parser.config + if config.python_profiling_config.is_enabled(): + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + python_profiler.start_profiling(StepPhase.START) + atexit.register(python_profiler.stop_profiling, StepPhase.END) + return profiler_config_parser, python_profiler + + def create_model(): model = tf.keras.models.Sequential( [ @@ -131,7 +144,7 @@ def train_step(images, labels): hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(trial_dir, save_format="tf") + # model.save(trial_dir, save_format="tf") trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] @@ -173,8 +186,6 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) - - # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -196,7 +207,7 @@ def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parse """ assert tf2_profiler_config_parser_by_step.profiling_enabled - hook = Hook(out_dir=out_dir) + hook = Hook(out_dir=out_dir, save_all=True) helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -333,13 +344,13 @@ def test_native_python_profiling_cprofiler( ): assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - config = tf2_python_cprofiler_config_parser_by_step.config + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) + + config = profiler_config_parser.config print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) print('\nname: ', config.python_profiling_config.name) print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps @@ -382,13 +393,9 @@ def test_native_python_profiling_pyinstrument( ): assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - config = tf2_python_pyinstrument_config_parser_by_step.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.profiler_name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + config = profiler_config_parser.config start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -398,6 +405,7 @@ def test_native_python_profiling_pyinstrument( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. 
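Taken together, [PATCH 52/97] and [PATCH 53/97] turn the Python profiler into a hook attribute that the tests construct themselves via set_up_profiling and attach with hook.python_profiler. The sketch below is a minimal, hedged illustration of that wiring for a native TF2 GradientTape loop, using only calls that appear in these patches; the output directory and the tiny random dataset are placeholders, and the batch-level method names are the ones in use at this point in the series (the next patch renames them to profiling_start_batch, profiling_end_batch, and profiling_end).

import tensorflow as tf

import smdebug.tensorflow as smd
from smdebug.profiler.profiler_config_parser import ProfilerConfigParser
from smdebug.profiler.python_profile_utils import StepPhase
from smdebug.profiler.python_profiler import PythonProfiler

# Build the Python profiler the same way set_up_profiling() does in the tests.
parser = ProfilerConfigParser()  # reads SMPROFILER_CONFIG_PATH from the environment
python_profiler = None
if parser.profiling_enabled and parser.config.python_profiling_config.is_enabled():
    python_profiler = PythonProfiler.get_python_profiler(parser.config, "tensorflow")
    python_profiler.start_profiling(StepPhase.START)

hook = smd.KerasHook(out_dir="/tmp/smdebug_demo")  # hypothetical output directory
hook.python_profiler = python_profiler  # attribute introduced by these patches

# Placeholder model and data, standing in for create_model() and the MNIST pipeline.
model = tf.keras.Sequential(
    [tf.keras.layers.Flatten(input_shape=(28, 28, 1)), tf.keras.layers.Dense(10, activation="softmax")]
)
dataset = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform((256, 28, 28, 1)), tf.random.uniform((256,), maxval=10, dtype=tf.int64))
).batch(64)

opt = tf.keras.optimizers.Adam()
hook.wrap_optimizer(opt)

for data, labels in dataset:
    labels = tf.one_hot(labels, depth=10)
    hook.start_profiling_start_train_batch()  # renamed in the next patch
    with hook.wrap_tape(tf.GradientTape()) as tape:
        logits = model(data, training=True)
        loss = tf.reduce_mean(logits)
    grads = tape.gradient(loss, model.variables)
    opt.apply_gradients(zip(grads, model.variables))
    hook.start_profiling_end_train_batch()  # renamed in the next patch
hook.stop_profiling_end_of_training()  # renamed in the next patch

Constructing the profiler inside the test, rather than relying on the module-level one created when keras.py is imported, is presumably what lets each test control which profiler config the hook sees.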
From 544b3e7d6a5fd479bc837c94cdc707918c0d573e Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 14:16:24 -0800 Subject: [PATCH 54/97] update profiler for native tf training --- smdebug/tensorflow/keras.py | 138 +++--- ...iler_all_params_config_parser_by_step.json | 8 + ...filer_cprofiler_config_parser_by_step.json | 2 +- ...er_pyinstrument_config_parser_by_step.json | 2 +- ...ofiling.py => test_native_tf2_profiler.py} | 397 +++++++++--------- 5 files changed, 271 insertions(+), 276 deletions(-) create mode 100644 tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json rename tests/profiler/tensorflow2/{test_native_tf2_profiling.py => test_native_tf2_profiler.py} (57%) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index d31017179..dbfc8bf1d 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,7 +61,7 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) -print('prezero-step start profiling object outside: ', python_profiler) +# print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -106,6 +106,7 @@ def __init__( # Profiling vars self.tf_profiler = None if is_profiler_supported_for_tf_version(): + # Third Party from tensorflow.python.profiler import profiler_v2 as tf_profiler self.tf_profiler = tf_profiler @@ -114,6 +115,7 @@ def __init__( self.is_dataloader_profiling = False self.tf_profiler_start_time_in_micros = 0 self.warm_up_completed = False + self.python_profiler = python_profiler # supports_tf_logs property was introduced in TF 2.3.0 # it indicates to the framework that the callback is not # limited to reading only numpy logs @@ -122,15 +124,10 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag is used to handle step number increment in the tensorflow native training when profiler is on - # it indicates to profiling for tensorflow2 native training - self.profiling_native_training = False - - self.python_profiler = python_profiler - print('\nObject inside: ', self.python_profiler) + # this flag indicates to debugging for tensorflow2 native training + self.debugger_native_training = False if self.python_profiler: - print('exit stop profiling object inside: ', self.python_profiler, self.step) atexit.register(self.python_profiler.stop_profiling, StepPhase.END) def _is_not_supported(self): @@ -340,6 +337,7 @@ def _create_tensors_for_matching_collections( def _get_distributed_model(self, mode): # not available in tf 1.13, code shouldn't reach here for 1.13 # because of _is_not_supported + # Third Party from tensorflow.python.keras.distribute.distributed_training_utils import ( get_distributed_model, ) @@ -1107,16 +1105,15 @@ def unwrap(func): def close(self): self._cleanup() - print('\nStep Number in the close function: ', self.step) + print("\nStep Number in the close function: ", self.step) if self.python_profiler: - print('python profiling for end of last train step to end of training') - print('close start profiling object inside: ', self.python_profiler, self.step) + # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, 
start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) - + self.debugger_native_training = False def _cleanup(self): # Unwrap the tape before closing @@ -1155,11 +1152,11 @@ def run(*args, **kwargs): self._prepare_collections() self.prepared_collections = True - if not self.profiling_native_training: - self._increment_step() - print('\nStep number in the push tape: ', self.step) + self._increment_step() + print("\nStep number in the push tape: ", self.step) if self._get_collections_to_save_for_step(): + # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) self._initialize_writers() if self.last_saved_step is not None and self._exported_collections is False: @@ -1247,7 +1244,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - print('\nStep number in the pop tape: ', self.step) + print("\nStep number in the pop tape: ", self.step) return run @@ -1274,16 +1271,12 @@ def wrap_tape(self, tape): :return: Wrapped tape of same type as passed. This tape should be used for training """ - # # Disable python profiling, because now we are starting wrap tape. - # if python_profiler: - # python_profiler.stop_profiling( - # StepPhase.STEP_START, - # end_mode=mode_keys_to_python_profile_mode(self.mode), - # end_step=0, - # ) - + # Third Party from tensorflow.python.eager.backprop import GradientTape + self.debugger_native_training = True + self.set_mode(ModeKeys.TRAIN) + if isinstance(tape, GradientTape): # unwrap tape before wrapping new tape to avoid recursive wrap tapes if self.tape: @@ -1311,78 +1304,67 @@ def record_tensor_value(self, tensor_name, tensor_value): self._initialize_writers(only_initialize_if_missing=True) self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) - def start_profiling_start_train_batch(self): + def profiling_start_batch(self, mode): """ Enabling profiler at the start of train batch when native tf2 training is used. 
""" - - # print('Start profiling train batch') - # a = PythonProfiler.get_python_profiler(profiler_config_parser.config, 'tensorflow') - # print('\nProfiler enabled: ', a) - # - # print('\nname: ', self.profiler_config_parser.config.python_profiling_config.name) - # print('\npython profiling: ', self.python_profiler) - # print('\nstart_step: ', self.profiler_config_parser.config.python_profiling_config.start_step) - # print('\nnum_steps: ', self.profiler_config_parser.config.python_profiling_config.num_steps) - self.start = time.time() if self._is_not_supported(): return - self.set_mode(ModeKeys.TRAIN) - self.profiling_native_training = True - if self.profiling_native_training: - self._increment_step() + self.set_mode(mode) - print('\nStep Number in start train batch: ', self.mode_steps[ModeKeys.TRAIN]) + if not self.debugger_native_training: + self.step += 1 + self.mode_steps[self.mode] += 1 + # Increment Global step number irrespective of what mode it is + if self.mode != ModeKeys.GLOBAL: + self.mode_steps[ModeKeys.GLOBAL] = self.step + + print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DATALOADER_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] ) and self.profiler_config_parser.write_tf_dataloader_flag( TF_DATALOADER_START_FLAG_FILENAME ): - print('Dataloader profiling') self.is_dataloader_profiling = True elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): self.is_dataloader_profiling = False if self.python_profiler: - print('Stop python profiling in start train batch') - print('start train batch stop profiling object inside: ', self.python_profiler, self.step) + # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - end_step=self.mode_steps[ModeKeys.TRAIN], + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], ) if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - print('Start python profiling in start train batch') - print('start train batch start profiling object inside: ', self.python_profiler, self.step) + # print("Start python profiling in start train batch") self.python_profiler.start_profiling( StepPhase.STEP_START, - start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - start_step=self.mode_steps[ModeKeys.TRAIN], + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], ) if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] ): if not self.is_detailed_profiling: self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( self.profiler_config_parser.config.local_path, "tensorflow", - self.mode_steps[ModeKeys.TRAIN], - ) - self.logger.info( - f"Enabling TF profiler on step: = {self.mode_steps[ModeKeys.TRAIN]}" + self.mode_steps[mode], ) + self.logger.info(f"Enabling TF profiler on step: = {self.mode_steps[mode]}") if not self.warm_up_completed: # warming up profiler before it will be profiling. 
self.tf_profiler.warmup() @@ -1391,9 +1373,7 @@ def start_profiling_start_train_batch(self): self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS self.is_detailed_profiling = True elif self.is_detailed_profiling: - self.logger.info( - f"Disabling TF profiler on step: ={self.mode_steps[ModeKeys.TRAIN]}" - ) + self.logger.info(f"Disabling TF profiler on step: ={self.mode_steps[mode]}") stop_tf_profiler( tf_profiler=self.tf_profiler, log_dir=self._log_dir, @@ -1401,59 +1381,55 @@ def start_profiling_start_train_batch(self): ) self.is_detailed_profiling = False - def start_profiling_end_train_batch(self): + def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - print('End profiling train batch') - print('\nStep Number in end train batch: ', self.mode_steps[ModeKeys.TRAIN]) + print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return self.record_trace_events( - training_phase="Step:" + str(ModeKeys.TRAIN), - op_name="Step:" + str(ModeKeys.TRAIN), + training_phase="Step:" + str(mode), + op_name="Step:" + str(mode), phase="X", timestamp=self.start, # this is start time for step duration=time.time() - self.start, pid=os.getpid(), - step_num=str(self.mode_steps[ModeKeys.TRAIN]), + step_num=str(self.mode_steps[mode]), ) if self.python_profiler: - print('Stop python profiling in end train batch') - print('end train batch stop profiling object inside: ', self.python_profiler, self.step) + # print("Stop python profiling in end train batch") self.python_profiler.stop_profiling( StepPhase.STEP_END, - end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - end_step=self.mode_steps[ModeKeys.TRAIN], + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], ) if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - print('Start python profiling in end train batch') - print('end train batch start profiling object inside: ', self.python_profiler, self.step) + # print("Start python profiling in end train batch") self.python_profiler.start_profiling( StepPhase.STEP_END, - start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - start_step=self.mode_steps[ModeKeys.TRAIN], + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], ) - def stop_profiling_end_of_training(self): + def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" - print('\nEnd of training!') - print('\nStep Number at the end of training: ', self.step) + print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): - print('Stop Dataloader profiling') + # print("Stop Dataloader profiling") self.is_dataloader_profiling = False if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: @@ -1464,5 +1440,3 @@ def stop_profiling_end_of_training(self): start_time_us=self.tf_profiler_start_time_in_micros, ) self.is_detailed_profiling = False - - self.profiling_native_training = False diff --git a/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json new file mode 100644 index 000000000..c119eebf8 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json @@ -0,0 +1,8 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "DetailedProfilingConfig": "{\"StartStep\": 2, \"NumSteps\": 2}", + "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" + } +} diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index d568b471c..e51c386c2 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} \ No newline at end of file +} diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index c1c45594c..53ac1485e 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } -} \ No newline at end of file +} diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py similarity index 57% rename from tests/profiler/tensorflow2/test_native_tf2_profiling.py rename to tests/profiler/tensorflow2/test_native_tf2_profiler.py index 9a7416be3..be2b6e124 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -1,41 +1,36 @@ # Standard Library -import os +import atexit import json +import os +import pstats import time from datetime import datetime from pathlib import Path -import pstats -import atexit # Third Party -import tensorflow as tf import pytest +import tensorflow as tf # First Party import smdebug.tensorflow as smd from smdebug.core.collection import CollectionKeys -from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter from smdebug.profiler.profiler_config_parser import ProfilerConfigParser -from smdebug.profiler.profiler_constants import 
TENSORBOARDTIMELINE_SUFFIX -from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents -from smdebug.tensorflow import KerasHook as Hook from smdebug.profiler.profiler_constants import ( CONVERT_TO_MICROSECS, - DEFAULT_PREFIX, - TRACE_DIRECTORY_FORMAT, CPROFILE_NAME, CPROFILE_STATS_FILENAME, + DEFAULT_PREFIX, PYINSTRUMENT_HTML_FILENAME, PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_NAME, + TENSORBOARDTIMELINE_SUFFIX, + TRACE_DIRECTORY_FORMAT, ) +from smdebug.profiler.python_profile_utils import StepPhase from smdebug.profiler.python_profiler import PythonProfiler -from smdebug.profiler.python_profiler import ( - PyinstrumentPythonProfiler, - cProfilePythonProfiler, - cProfileTimer, -) -from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase +from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents +from smdebug.tensorflow import KerasHook as Hook + @pytest.fixture() def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @@ -46,14 +41,18 @@ def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @pytest.fixture() def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json" + ) monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) return ProfilerConfigParser() @pytest.fixture() def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json" + ) monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) return ProfilerConfigParser() @@ -65,24 +64,13 @@ def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): return ProfilerConfigParser() -@pytest.fixture -def test_framework(): - return "test-framework" - - @pytest.fixture() -def cprofile_python_profiler(out_dir, test_framework): - return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) - - -@pytest.fixture() -def pyinstrument_python_profiler(out_dir, test_framework): - return PyinstrumentPythonProfiler(out_dir, test_framework) - - -@pytest.fixture() -def framework_dir(out_dir, test_framework): - return "{0}/framework/{1}".format(out_dir, test_framework) +def tf2_profiler_config_parser_by_step_all_params(config_folder, monkeypatch): + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_all_params_config_parser_by_step.json" + ) + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() def set_up_profiling(profilerconfig): @@ -110,65 +98,19 @@ def create_model(): return model -def helper_native_tf2_profiler_debugger(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - +def prepare_dataset(): mnist = tf.keras.datasets.mnist (x_train, y_train), _ = mnist.load_data() dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - 
for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with hook.wrap_tape(tf.GradientTape()) as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - # model.save(trial_dir, save_format="tf") - - trial = smd.create_trial(trial_dir) - assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] - assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ - "weights/dense/kernel:0", - "weights/dense_1/kernel:0", - ] - assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ - "weights/dense/bias:0", - "weights/dense_1/bias:0", - ] - assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ - "Adam/beta_1:0", - "Adam/beta_2:0", - "Adam/decay:0", - "Adam/iter:0", - "Adam/learning_rate:0", - ] - assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] - assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] - + dataset = dataset.shuffle(1000).batch(64) + return dataset -def helper_native_tf2_profiler(trial_dir, hook): +def helper_native_tf2_gradtape( + hook, debugger=False, python_profiler=None, start_step=None, end_step=None +): def get_grads(images, labels): return model(images, training=True) @@ -176,73 +118,51 @@ def get_grads(images, labels): def train_step(images, labels): return tf.reduce_mean(get_grads(images, labels)) - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) + dataset = prepare_dataset() model = create_model() opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + current_step = 0 n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with tf.GradientTape() as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() + if debugger: + with hook.wrap_tape(tf.GradientTape()) as tape: + hook.profiling_start_batch(mode=smd.modes.TRAIN) + logits = train_step(data, labels) + if python_profiler and start_step <= current_step < end_step: + assert python_profiler._start_step == current_step + assert python_profiler._start_phase == StepPhase.STEP_START + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + else: + with tf.GradientTape() as tape: + hook.profiling_start_batch(mode=smd.modes.TRAIN) + logits = train_step(data, labels) + if python_profiler and start_step <= current_step < end_step: + assert python_profiler._start_step == current_step + assert python_profiler._start_phase == StepPhase.STEP_START + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, 
model.variables)) + hook.profiling_end_batch(mode=smd.modes.TRAIN) + hook.profiling_end() @pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir, save_all=True) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and + This test executes a TF2 native training script with profiler, enables detailed TF profiling by step, and verifies the number of events. """ assert tf2_profiler_config_parser_by_step.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape(hook=hook) t_events = TensorboardProfilerEvents() @@ -269,15 +189,15 @@ def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step @pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and + This test executes a TF2 native training script with profiler, enables detailed TF profiling by time, and verifies the number of events. """ assert tf2_profiler_config_parser_by_time.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape(hook=hook) # get tensorboard timeline files files = [] @@ -302,67 +222,33 @@ def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parse # consecutive runs. Hence, the approximation in the below asserts. assert num_trace_events >= 700 + @pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): +def test_native_python_profiling_cprofiler(out_dir, tf2_python_cprofiler_config_parser_by_step): """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. + This test executes a TF2 native training script with profiler, enables cprofiler by step, and + verifies the python profiling's steps and expected output files. 
""" - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - - -# @pytest.mark.parametrize("use_pyinstrument", [False, True]) -# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) -def test_native_python_profiling_cprofiler( - out_dir, tf2_python_cprofiler_config_parser_by_step -): assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) + profiler_config_parser, python_profiler = set_up_profiling( + tf2_python_cprofiler_config_parser_by_step + ) config = profiler_config_parser.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) - print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps profiler_name = CPROFILE_NAME allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) hook = Hook(out_dir=out_dir) hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape( + hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) # Test that directory and corresponding files exist. assert os.path.isdir(python_stats_dir) @@ -370,6 +256,9 @@ def test_native_python_profiling_cprofiler( for node_id in os.listdir(python_stats_dir): node_dir_path = os.path.join(python_stats_dir, node_id) stats_dirs = os.listdir(node_dir_path) + # Since python_profiler.stop_profiling for the posthookclose step automatically executed + # upon normal interpreter termination, + # the number of the files is (end_step - start_step) * 2 + 2 - 1. assert len(stats_dirs) == (end_step - start_step) * 2 + 1 for stats_dir in stats_dirs: @@ -388,12 +277,19 @@ def test_native_python_profiling_cprofiler( assert json.load(f) +@pytest.mark.skip_if_non_eager def test_native_python_profiling_pyinstrument( out_dir, tf2_python_pyinstrument_config_parser_by_step ): + """ + This test executes a TF2 native training script with profiler, enables pyinstrument by step, and + verifies the python profiling's steps and expected output files. 
+ """ assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) + profiler_config_parser, python_profiler = set_up_profiling( + tf2_python_pyinstrument_config_parser_by_step + ) config = profiler_config_parser.config start_step = config.python_profiling_config.start_step @@ -402,11 +298,13 @@ def test_native_python_profiling_pyinstrument( profiler_name = PYINSTRUMENT_NAME allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) hook = Hook(out_dir=out_dir) hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape( + hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) # Test that directory and corresponding files exist. assert os.path.isdir(python_stats_dir) @@ -414,6 +312,9 @@ def test_native_python_profiling_pyinstrument( for node_id in os.listdir(python_stats_dir): node_dir_path = os.path.join(python_stats_dir, node_id) stats_dirs = os.listdir(node_dir_path) + # Since python_profiler.stop_profiling for the posthookclose step automatically executed + # upon normal interpreter termination, + # the number of the files is (end_step - start_step) * 2 + 2 - 1. assert len(stats_dirs) == (end_step - start_step) * 2 + 1 for stats_dir in stats_dirs: @@ -432,17 +333,16 @@ def test_native_python_profiling_pyinstrument( assert json.load(f) +@pytest.mark.skip_if_non_eager def test_create_timeline_file(simple_profiler_config_parser, out_dir): """ - This test is meant to test successful creation of the timeline file according to file path specification. - $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ - {$ENV_NODE_ID_4digits0padded}_pythontimeline.json + This test is to test the creation of the timeline file according to file path specification. It reads backs the file contents to make sure it is in valid JSON format. """ assert simple_profiler_config_parser.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + helper_native_tf2_gradtape(hook=hook) files = [] for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): @@ -462,4 +362,117 @@ def test_create_timeline_file(simple_profiler_config_parser, out_dir): with open(files[0]) as timeline_file: events_dict = json.load(timeline_file) - assert events_dict \ No newline at end of file + assert events_dict + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_debugger_all_params( + tf2_profiler_config_parser_by_step_all_params, out_dir +): + """ + This test executes a TF2 native training script with debugger and profiler, enables detailed TF profiling, python + profiling by step. 
+ """ + assert tf2_profiler_config_parser_by_step_all_params.profiling_enabled + + profiler_config_parser, python_profiler = set_up_profiling( + tf2_profiler_config_parser_by_step_all_params + ) + + config = profiler_config_parser.config + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) + + hook = Hook(out_dir=out_dir, save_all=True) + hook.python_profiler = python_profiler + helper_native_tf2_gradtape(hook=hook, debugger=True) + + # Verifying python profiling related files. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + # Verifying detailed TF profiling. + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path( + tf2_profiler_config_parser_by_step_all_params.config.local_path + "/framework" + ).rglob(f"*{TENSORBOARDTIMELINE_SUFFIX}"): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + # Verifying timeline files. + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict + + # Verifying tensor names. + trial = smd.create_trial(out_dir) + assert len(trial.steps()) > 0, "Nothing saved at any step." + assert len(trial.tensor_names()) > 0, "Tensors were not saved." 
+ assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0 + assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) > 0 + assert trial.tensor_names(collection="optimizer_variables") == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] From 31dfb8de960cadc5089a41f9783709e1ddcc8aec Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Wed, 13 Jan 2021 15:25:40 -0700 Subject: [PATCH 55/97] Modify distributed_training_utils.py import for TF 2.4 (#422) --- smdebug/tensorflow/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index af9a0e901..11d700dfc 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -427,6 +427,10 @@ def is_tf_version_greater_than_2_4_x(): return version.parse("2.4.0") <= TF_VERSION +def is_tf_version_greater_than_2_4_x(): + return version.parse("2.4.0") <= version.parse(tf.__version__) + + def is_profiler_supported_for_tf_version(): # Profiler Support Added For TF Versions 2.2.0 And Greater return version.parse("2.2.0") <= TF_VERSION From 0a064503f4d204a652e97a67130a3da31d9013cb Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 19:48:15 -0800 Subject: [PATCH 56/97] remove print statement --- smdebug/tensorflow/keras.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index dbfc8bf1d..439ee09a8 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1105,7 +1105,7 @@ def unwrap(func): def close(self): self._cleanup() - print("\nStep Number in the close function: ", self.step) + # print("\nStep Number in the close function: ", self.step) if self.python_profiler: # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( @@ -1153,7 +1153,7 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() - print("\nStep number in the push tape: ", self.step) + # print("\nStep number in the push tape: ", self.step) if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1244,7 +1244,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - print("\nStep number in the pop tape: ", self.step) + # print("\nStep number in the pop tape: ", self.step) return run @@ -1322,7 +1322,7 @@ def profiling_start_batch(self, mode): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - print("Step Number in start train batch: ", self.mode_steps[mode]) + # print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() @@ -1385,7 +1385,7 @@ def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - print("Step Number in end train batch: ", self.mode_steps[mode]) + # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1421,7 +1421,7 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" - print("Step Number at the end of training: ", self.step) + # print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() From 1f2a2d383768b99e5f96969505af88e4553c95b4 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:32:23 -0800 Subject: [PATCH 57/97] add changes for enabling profiling for native tf2 training --- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py new file mode 100644 index 000000000..e69de29bb From 7fb9c5b85a3509c01641294d439017381422db45 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:42:55 -0800 Subject: [PATCH 58/97] add changes for enabling profiling in the native tf2 training --- smdebug/tensorflow/keras.py | 7 +- .../tensorflow2/test_native_tf2_profiling.py | 144 ++++++++++++++++++ 2 files changed, 147 insertions(+), 4 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 439ee09a8..22ae7b588 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1271,7 +1271,6 @@ def wrap_tape(self, tape): :return: Wrapped tape of same type as passed. This tape should be used for training """ - # Third Party from tensorflow.python.eager.backprop import GradientTape self.debugger_native_training = True @@ -1356,7 +1355,7 @@ def profiling_start_batch(self, mode): if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] ): if not self.is_detailed_profiling: self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( @@ -1386,7 +1385,6 @@ def profiling_end_batch(self, mode): Enabling profiler at the end of train batch when native tf2 training is used. 
""" # print("Step Number in end train batch: ", self.mode_steps[mode]) - if self._is_not_supported(): return @@ -1427,7 +1425,7 @@ def profiling_end(self): self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): # print("Stop Dataloader profiling") self.is_dataloader_profiling = False @@ -1440,3 +1438,4 @@ def profiling_end(self): start_time_us=self.tf_profiler_start_time_in_micros, ) self.is_detailed_profiling = False + diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index e69de29bb..16b9e121a 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -0,0 +1,144 @@ +# Standard Library +import os +from pathlib import Path + +# Third Party +import tensorflow as tf +import pytest +# from tests.tensorflow2.test_keras import helper_keras_fit + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.profiler.profiler_config_parser import ProfilerConfigParser +from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX +from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents +from smdebug.tensorflow import KerasHook as Hook + +@pytest.fixture() +def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +def create_hook(trial_dir): + hook = smd.KerasHook(trial_dir, save_all=True) + return hook + + +def create_model(): + model = tf.keras.models.Sequential( + [ + # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 + tf.keras.layers.Flatten(input_shape=(28, 28, 1)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) + return model + +def helper_native_tf2(): + + +def test_gradtape_tf_function(out_dir): + def get_grads(images, labels): + # with tf.GradientTape() as tape: + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(64) + model = create_model() + hook = create_hook(out_dir) + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + dataset_labels = labels + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with hook.wrap_tape(tf.GradientTape()) as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + # hook.save_tensor("logits", logits, 
CollectionKeys.OUTPUTS) + # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + model.save(out_dir, save_format="tf") + + + trial = smd.create_trial(out_dir) + assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ + "weights/dense/kernel:0", + "weights/dense_1/kernel:0", + ] + assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ + "weights/dense/bias:0", + "weights/dense_1/bias:0", + ] + assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] + + +def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) + hook.close() + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. 
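For reference, a minimal stdlib-only sketch of the same event count, assuming the timeline is ordinary Chrome trace JSON (typically gzip-compressed, e.g. *.trace.json.gz, with the events either at the top level or under a "traceEvents" key); the TensorboardProfilerEvents reader used above remains the supported path, and count_trace_events is only an illustrative helper, not part of the patch.

    import gzip
    import json

    def count_trace_events(trace_file):
        # Chrome trace files hold either a bare JSON array of events or an
        # object whose "traceEvents" field contains that array.
        opener = gzip.open if str(trace_file).endswith(".gz") else open
        with opener(trace_file, "rt") as f:
            trace = json.load(f)
        return len(trace["traceEvents"] if isinstance(trace, dict) else trace)

    # e.g. count_trace_events(trace_file) should land in the same range as the
    # assertion below.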
+ assert num_trace_events >= 230 \ No newline at end of file From 21699de9c56f2ac9bc5136a3167fc7ac5d348023 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 11 Jan 2021 11:08:36 -0800 Subject: [PATCH 59/97] add tests --- ...filer_cprofiler_config_parser_by_step.json | 1 + ...er_pyinstrument_config_parser_by_step.json | 1 + .../tensorflow2/test_native_tf2_profiling.py | 349 +++++++++++++++++- 3 files changed, 331 insertions(+), 20 deletions(-) diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index e51c386c2..f06218f77 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -5,3 +5,4 @@ "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } } + diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 53ac1485e..ad5a555f7 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -5,3 +5,4 @@ "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } + diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 16b9e121a..3e1f08b2b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -1,19 +1,40 @@ # Standard Library import os +import json +import time +from datetime import datetime from pathlib import Path +import pstats # Third Party import tensorflow as tf import pytest -# from tests.tensorflow2.test_keras import helper_keras_fit # First Party import smdebug.tensorflow as smd from smdebug.core.collection import CollectionKeys +from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter from smdebug.profiler.profiler_config_parser import ProfilerConfigParser from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents from smdebug.tensorflow import KerasHook as Hook +from smdebug.profiler.profiler_constants import ( + CONVERT_TO_MICROSECS, + DEFAULT_PREFIX, + TRACE_DIRECTORY_FORMAT, + CPROFILE_NAME, + CPROFILE_STATS_FILENAME, + PYINSTRUMENT_HTML_FILENAME, + PYINSTRUMENT_JSON_FILENAME, + PYINSTRUMENT_NAME, +) +from smdebug.profiler.python_profiler import PythonProfiler +from smdebug.profiler.python_profiler import ( + PyinstrumentPythonProfiler, + cProfilePythonProfiler, + cProfileTimer, +) +from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase @pytest.fixture() def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @@ -22,6 +43,20 @@ def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): return ProfilerConfigParser() +@pytest.fixture() +def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def 
tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + @pytest.fixture() def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") @@ -29,9 +64,24 @@ def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): return ProfilerConfigParser() -def create_hook(trial_dir): - hook = smd.KerasHook(trial_dir, save_all=True) - return hook +@pytest.fixture +def test_framework(): + return "test-framework" + + +@pytest.fixture() +def cprofile_python_profiler(out_dir, test_framework): + return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) + + +@pytest.fixture() +def pyinstrument_python_profiler(out_dir, test_framework): + return PyinstrumentPythonProfiler(out_dir, test_framework) + + +@pytest.fixture() +def framework_dir(out_dir, test_framework): + return "{0}/framework/{1}".format(out_dir, test_framework) def create_model(): @@ -46,12 +96,10 @@ def create_model(): ) return model -def helper_native_tf2(): +def helper_native_tf2_profiler_debugger(trial_dir, hook): -def test_gradtape_tf_function(out_dir): def get_grads(images, labels): - # with tf.GradientTape() as tape: return model(images, training=True) @tf.function @@ -63,32 +111,29 @@ def train_step(images, labels): dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) ) - dataset = dataset.shuffle(1000).batch(64) + dataset = dataset.shuffle(1000).batch(128) model = create_model() - hook = create_hook(out_dir) opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: - dataset_labels = labels labels = tf.one_hot(labels, depth=10) hook.start_profiling_start_train_batch() with hook.wrap_tape(tf.GradientTape()) as tape: logits = train_step(data, labels) grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) - # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(out_dir, save_format="tf") - + model.save(trial_dir, save_format="tf") - trial = smd.create_trial(out_dir) + trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ "weights/dense/kernel:0", @@ -109,7 +154,40 @@ def train_step(images, labels): assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] -def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): +def helper_native_tf2_profiler(trial_dir, hook): + + def get_grads(images, labels): + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = 
tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(128) + model = create_model() + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with tf.GradientTape() as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): """ This test executes a TF2 native training script, enables detailed TF profiling by step, and verifies the number of events. @@ -117,8 +195,7 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config assert tf2_profiler_config_parser_by_step.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) - hook.close() + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -141,4 +218,236 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config # The number of events is varying by a small number on # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 \ No newline at end of file + assert num_trace_events >= 230 + + +def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. 
+ """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + + +# @pytest.mark.parametrize("use_pyinstrument", [False, True]) +# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) +def test_native_python_profiling_cprofiler( + out_dir, tf2_python_cprofiler_config_parser_by_step +): + assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled + + config = tf2_python_cprofiler_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. 
+ assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_native_python_profiling_pyinstrument( + out_dir, tf2_python_pyinstrument_config_parser_by_step +): + assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled + + config = tf2_python_pyinstrument_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.profiler_name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = PYINSTRUMENT_NAME + allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_create_timeline_file(simple_profiler_config_parser, out_dir): + """ + This test is meant to test successful creation of the timeline file according to file path specification. + $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ + {$ENV_NODE_ID_4digits0padded}_pythontimeline.json + It reads backs the file contents to make sure it is in valid JSON format. 
+ """ + assert simple_profiler_config_parser.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict \ No newline at end of file From 8241846e7fd198f4dcb18ac03c061527e3953fc8 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 12:05:19 -0800 Subject: [PATCH 60/97] add python profiler as attr for kerashook --- smdebug/tensorflow/keras.py | 9 ++------- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 6 +++++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 22ae7b588..e1111045b 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,7 +61,6 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) -# print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -1105,9 +1104,8 @@ def unwrap(func): def close(self): self._cleanup() - # print("\nStep Number in the close function: ", self.step) + if self.python_profiler: - # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1115,6 +1113,7 @@ def close(self): ) self.debugger_native_training = False + def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1337,7 +1336,6 @@ def profiling_start_batch(self, mode): self.is_dataloader_profiling = False if self.python_profiler: - # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1346,7 +1344,6 @@ def profiling_start_batch(self, mode): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - # print("Start python profiling in start train batch") self.python_profiler.start_profiling( StepPhase.STEP_START, start_mode=mode_keys_to_python_profile_mode(mode), @@ -1399,7 +1396,6 @@ def profiling_end_batch(self, mode): ) if self.python_profiler: - # print("Stop python profiling in end train batch") self.python_profiler.stop_profiling( StepPhase.STEP_END, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1408,7 +1404,6 @@ def profiling_end_batch(self, mode): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - # print("Start python profiling in end train batch") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(mode), diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 3e1f08b2b..9ba8dd60b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ 
b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -173,6 +173,8 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + + # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -338,6 +340,7 @@ def test_native_python_profiling_cprofiler( print('\nnum_steps: ', config.python_profiling_config.num_steps) python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -347,6 +350,7 @@ def test_native_python_profiling_cprofiler( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. @@ -384,7 +388,7 @@ def test_native_python_profiling_pyinstrument( print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps From 92c5749fcbfa8921ff883699ed651e1d7192b994 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 16:01:59 -0800 Subject: [PATCH 61/97] add tests --- smdebug/tensorflow/keras.py | 2 +- .../tensorflow2/test_native_tf2_profiling.py | 34 ++++++++++++------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index e1111045b..2b8c2a9eb 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -316,7 +316,7 @@ def _create_tensors_for_matching_collections( for t in tensor_refs: self.tensor_to_collections[t.name] = colls_with_tensor elif colls_with_tensor: - # we should only readd tensors which were already added if these are variables + # we should only read tensors which were already added if these are variables # other tensors are part of a different mode, and will cause a crash if fetched # because their input placeholders will not be passed. 
if any( diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 9ba8dd60b..9a7416be3 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -5,6 +5,7 @@ from datetime import datetime from pathlib import Path import pstats +import atexit # Third Party import tensorflow as tf @@ -84,6 +85,18 @@ def framework_dir(out_dir, test_framework): return "{0}/framework/{1}".format(out_dir, test_framework) +def set_up_profiling(profilerconfig): + profiler_config_parser = profilerconfig + python_profiler = None + if profiler_config_parser.profiling_enabled: + config = profiler_config_parser.config + if config.python_profiling_config.is_enabled(): + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + python_profiler.start_profiling(StepPhase.START) + atexit.register(python_profiler.stop_profiling, StepPhase.END) + return profiler_config_parser, python_profiler + + def create_model(): model = tf.keras.models.Sequential( [ @@ -131,7 +144,7 @@ def train_step(images, labels): hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(trial_dir, save_format="tf") + # model.save(trial_dir, save_format="tf") trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] @@ -173,8 +186,6 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) - - # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -196,7 +207,7 @@ def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parse """ assert tf2_profiler_config_parser_by_step.profiling_enabled - hook = Hook(out_dir=out_dir) + hook = Hook(out_dir=out_dir, save_all=True) helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -333,13 +344,13 @@ def test_native_python_profiling_cprofiler( ): assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - config = tf2_python_cprofiler_config_parser_by_step.config + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) + + config = profiler_config_parser.config print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) print('\nname: ', config.python_profiling_config.name) print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps @@ -382,13 +393,9 @@ def test_native_python_profiling_pyinstrument( ): assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - config = tf2_python_pyinstrument_config_parser_by_step.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.profiler_name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + config = profiler_config_parser.config start_step = 
config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -398,6 +405,7 @@ def test_native_python_profiling_pyinstrument( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. From 28509f42c74bd0ae1746f5511acc33a2c8a38337 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 14:16:24 -0800 Subject: [PATCH 62/97] update profiler for native tf training --- smdebug/tensorflow/keras.py | 24 +- ...filer_cprofiler_config_parser_by_step.json | 3 +- ...er_pyinstrument_config_parser_by_step.json | 1 - .../tensorflow2/test_native_tf2_profiling.py | 465 ------------------ 4 files changed, 16 insertions(+), 477 deletions(-) delete mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 2b8c2a9eb..72f15e9f0 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,6 +61,10 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) +<<<<<<< HEAD +======= +# print('prezero-step start profiling object outside: ', python_profiler) +>>>>>>> update profiler for native tf training class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -144,6 +148,7 @@ def _is_not_supported(self): self._hook_supported = False elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: try: +<<<<<<< HEAD if is_tf_version_greater_than_2_4_x(): # distributed_training_utils.py renamed to distributed_training_utils_v1 in tf 2.4.0 from tensorflow.python.keras.distribute.distributed_training_utils_v1 import ( @@ -154,6 +159,12 @@ def _is_not_supported(self): get_distributed_model, ) +======= + # Third Party + from tensorflow.python.keras.distribute.distributed_training_utils import ( + get_distributed_model, + ) +>>>>>>> update profiler for native tf training except ImportError: # for tf1.13 we can't import this, so we can't support mirrored strategy self.logger.info( @@ -1106,6 +1117,7 @@ def close(self): self._cleanup() if self.python_profiler: + # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1113,7 +1125,6 @@ def close(self): ) self.debugger_native_training = False - def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1152,7 +1163,6 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() - # print("\nStep number in the push tape: ", self.step) if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1243,7 +1253,6 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - # print("\nStep number in the pop tape: ", self.step) return run @@ -1320,7 +1329,7 @@ def profiling_start_batch(self, mode): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - # print("Step Number in start train batch: ", self.mode_steps[mode]) + print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() @@ -1336,6 +1345,7 @@ def profiling_start_batch(self, mode): 
self.is_dataloader_profiling = False if self.python_profiler: + # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1381,7 +1391,6 @@ def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1414,13 +1423,11 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. """ - # print("Step Number at the end of training: ", self.step) - # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME + TF_DATALOADER_END_FLAG_FILENAME ): # print("Stop Dataloader profiling") self.is_dataloader_profiling = False @@ -1433,4 +1440,3 @@ def profiling_end(self): start_time_us=self.tf_profiler_start_time_in_micros, ) self.is_detailed_profiling = False - diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index f06218f77..d568b471c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,5 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} - +} \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index ad5a555f7..53ac1485e 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -5,4 +5,3 @@ "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } - diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py deleted file mode 100644 index 9a7416be3..000000000 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ /dev/null @@ -1,465 +0,0 @@ -# Standard Library -import os -import json -import time -from datetime import datetime -from pathlib import Path -import pstats -import atexit - -# Third Party -import tensorflow as tf -import pytest - -# First Party -import smdebug.tensorflow as smd -from smdebug.core.collection import CollectionKeys -from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter -from smdebug.profiler.profiler_config_parser import ProfilerConfigParser -from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX -from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents -from smdebug.tensorflow import KerasHook as Hook -from smdebug.profiler.profiler_constants import ( - CONVERT_TO_MICROSECS, - DEFAULT_PREFIX, - TRACE_DIRECTORY_FORMAT, - CPROFILE_NAME, - CPROFILE_STATS_FILENAME, - PYINSTRUMENT_HTML_FILENAME, - PYINSTRUMENT_JSON_FILENAME, - PYINSTRUMENT_NAME, -) -from smdebug.profiler.python_profiler import PythonProfiler -from smdebug.profiler.python_profiler import ( - PyinstrumentPythonProfiler, - cProfilePythonProfiler, - 
cProfileTimer, -) -from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase - -@pytest.fixture() -def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture -def test_framework(): - return "test-framework" - - -@pytest.fixture() -def cprofile_python_profiler(out_dir, test_framework): - return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) - - -@pytest.fixture() -def pyinstrument_python_profiler(out_dir, test_framework): - return PyinstrumentPythonProfiler(out_dir, test_framework) - - -@pytest.fixture() -def framework_dir(out_dir, test_framework): - return "{0}/framework/{1}".format(out_dir, test_framework) - - -def set_up_profiling(profilerconfig): - profiler_config_parser = profilerconfig - python_profiler = None - if profiler_config_parser.profiling_enabled: - config = profiler_config_parser.config - if config.python_profiling_config.is_enabled(): - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") - python_profiler.start_profiling(StepPhase.START) - atexit.register(python_profiler.stop_profiling, StepPhase.END) - return profiler_config_parser, python_profiler - - -def create_model(): - model = tf.keras.models.Sequential( - [ - # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 - tf.keras.layers.Flatten(input_shape=(28, 28, 1)), - tf.keras.layers.Dense(128, activation="relu"), - tf.keras.layers.Dropout(0.2), - tf.keras.layers.Dense(10, activation="softmax"), - ] - ) - return model - - -def helper_native_tf2_profiler_debugger(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with hook.wrap_tape(tf.GradientTape()) as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - 
hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - # model.save(trial_dir, save_format="tf") - - trial = smd.create_trial(trial_dir) - assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] - assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ - "weights/dense/kernel:0", - "weights/dense_1/kernel:0", - ] - assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ - "weights/dense/bias:0", - "weights/dense_1/bias:0", - ] - assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ - "Adam/beta_1:0", - "Adam/beta_2:0", - "Adam/decay:0", - "Adam/iter:0", - "Adam/learning_rate:0", - ] - assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] - assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] - - -def helper_native_tf2_profiler(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with tf.GradientTape() as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir, save_all=True) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - -def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. 
- """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. 
- assert num_trace_events >= 700 - - -# @pytest.mark.parametrize("use_pyinstrument", [False, True]) -# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) -def test_native_python_profiling_cprofiler( - out_dir, tf2_python_cprofiler_config_parser_by_step -): - assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) - - config = profiler_config_parser.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) - print('\ntest function: ', python_profiler) - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = CPROFILE_NAME - allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # Test that directory and corresponding files exist. - assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -def test_native_python_profiling_pyinstrument( - out_dir, tf2_python_pyinstrument_config_parser_by_step -): - assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = PYINSTRUMENT_NAME - allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # Test that directory and corresponding files exist. 
- assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -def test_create_timeline_file(simple_profiler_config_parser, out_dir): - """ - This test is meant to test successful creation of the timeline file according to file path specification. - $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ - {$ENV_NODE_ID_4digits0padded}_pythontimeline.json - It reads backs the file contents to make sure it is in valid JSON format. - """ - assert simple_profiler_config_parser.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - files = [] - for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): - files.append(path) - - assert len(files) == 1 - - file_ts = files[0].name.split("_")[0] - folder_name = files[0].parent.name - assert folder_name == time.strftime( - TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) - ) - assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( - TRACE_DIRECTORY_FORMAT - ) - - with open(files[0]) as timeline_file: - events_dict = json.load(timeline_file) - - assert events_dict \ No newline at end of file From b9bf0c244c2204fe1a2bbf45b11f2d22723694fa Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Wed, 13 Jan 2021 15:25:40 -0700 Subject: [PATCH 63/97] Modify distributed_training_utils.py import for TF 2.4 (#422) --- smdebug/tensorflow/keras.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 72f15e9f0..e9a7a9635 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,10 +61,6 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) -<<<<<<< HEAD -======= -# print('prezero-step start profiling object outside: ', python_profiler) ->>>>>>> update profiler for native tf training class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -148,7 +144,6 @@ def _is_not_supported(self): self._hook_supported = False elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: try: -<<<<<<< HEAD if is_tf_version_greater_than_2_4_x(): # distributed_training_utils.py renamed to distributed_training_utils_v1 in tf 2.4.0 from tensorflow.python.keras.distribute.distributed_training_utils_v1 import ( @@ -159,12 +154,6 @@ def _is_not_supported(self): get_distributed_model, ) -======= - # Third Party - from tensorflow.python.keras.distribute.distributed_training_utils import ( - get_distributed_model, - ) ->>>>>>> update profiler for native tf training except ImportError: # for tf1.13 we can't import this, so we can't support mirrored strategy self.logger.info( From 
19e2a9c2ef5239ec8f99503fd4426375a7fe6f64 Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Wed, 13 Jan 2021 16:27:58 -0700 Subject: [PATCH 64/97] Cache TF Versions (#421) --- smdebug/tensorflow/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 11d700dfc..c26247229 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -428,7 +428,7 @@ def is_tf_version_greater_than_2_4_x(): def is_tf_version_greater_than_2_4_x(): - return version.parse("2.4.0") <= version.parse(tf.__version__) + return version.parse("2.4.0") <= TF_VERSION def is_profiler_supported_for_tf_version(): From a636c38442fcf8cee0a791f3238f85c10b17ea16 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 19:48:15 -0800 Subject: [PATCH 65/97] remove print statement --- smdebug/tensorflow/keras.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index e9a7a9635..4f79ea629 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1104,7 +1104,7 @@ def unwrap(func): def close(self): self._cleanup() - + # print("\nStep Number in the close function: ", self.step) if self.python_profiler: # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( @@ -1152,6 +1152,8 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() + # print("\nStep number in the push tape: ", self.step) + if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1242,6 +1244,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step + # print("\nStep number in the pop tape: ", self.step) return run @@ -1318,7 +1321,7 @@ def profiling_start_batch(self, mode): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - print("Step Number in start train batch: ", self.mode_steps[mode]) + # print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() @@ -1380,6 +1383,7 @@ def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ + # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1412,6 +1416,7 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" + # print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() From 1781732ff78691c88cf4c9ad39714d554c20fb35 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 22:02:39 -0800 Subject: [PATCH 66/97] clean up the code --- smdebug/tensorflow/keras.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 4f79ea629..8485677a9 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1152,8 +1152,6 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() - # print("\nStep number in the push tape: ", self.step) - if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1244,7 +1242,6 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - # print("\nStep number in the pop tape: ", self.step) return run @@ -1314,6 +1311,8 @@ def profiling_start_batch(self, mode): self.set_mode(mode) + # When only profiler is enabled in the native tf2 training, + # increasing the step number in the TRAIN and GLOBAL mode. if not self.debugger_native_training: self.step += 1 self.mode_steps[self.mode] += 1 @@ -1321,8 +1320,6 @@ def profiling_start_batch(self, mode): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - # print("Step Number in start train batch: ", self.mode_steps[mode]) - self.profiler_config_parser.load_config() if self.profiler_config_parser.should_save_metrics( @@ -1337,7 +1334,6 @@ def profiling_start_batch(self, mode): self.is_dataloader_profiling = False if self.python_profiler: - # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1354,7 +1350,7 @@ def profiling_start_batch(self, mode): if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] ): if not self.is_detailed_profiling: self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( @@ -1383,7 +1379,6 @@ def profiling_end_batch(self, mode): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1416,14 +1411,12 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" - # print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( TF_DATALOADER_END_FLAG_FILENAME ): - # print("Stop Dataloader profiling") self.is_dataloader_profiling = False if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: From 4b81d123631022d58c1699218079876a9d1be060 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 23:22:21 -0800 Subject: [PATCH 67/97] clean up code --- smdebug/tensorflow/keras.py | 6 +----- smdebug/tensorflow/utils.py | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 8485677a9..267d71257 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -105,7 +105,6 @@ def __init__( # Profiling vars self.tf_profiler = None if is_profiler_supported_for_tf_version(): - # Third Party from tensorflow.python.profiler import profiler_v2 as tf_profiler self.tf_profiler = tf_profiler @@ -336,7 +335,6 @@ def _create_tensors_for_matching_collections( def _get_distributed_model(self, mode): # not available in tf 1.13, code shouldn't reach here for 1.13 # because of _is_not_supported - # Third Party from tensorflow.python.keras.distribute.distributed_training_utils import ( get_distributed_model, ) @@ -1104,9 +1102,8 @@ def unwrap(func): def close(self): self._cleanup() - # print("\nStep Number in the close function: ", self.step) + if self.python_profiler: - # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1154,7 +1151,6 @@ def run(*args, **kwargs): self._increment_step() if self._get_collections_to_save_for_step(): - # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) self._initialize_writers() if self.last_saved_step is not None and self._exported_collections is False: diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index c26247229..af9a0e901 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -427,10 +427,6 @@ def is_tf_version_greater_than_2_4_x(): return version.parse("2.4.0") <= TF_VERSION -def is_tf_version_greater_than_2_4_x(): - return version.parse("2.4.0") <= TF_VERSION - - def is_profiler_supported_for_tf_version(): # Profiler Support Added For TF Versions 2.2.0 And Greater return version.parse("2.2.0") <= TF_VERSION From 375ad1de8c3b689dcf1feaea4dbe84af2baf1cfa Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 23:56:45 -0800 Subject: [PATCH 68/97] update format --- ...est_tf2_python_profiler_cprofiler_config_parser_by_step.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index d568b471c..e51c386c2 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} \ No newline at end of file +} From 7dcd4010541a9df79bbbfdf55d131b8ef670359e Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Thu, 21 Jan 
2021 19:43:32 -0800 Subject: [PATCH 69/97] revise on PR --- smdebug/core/hook.py | 9 +- smdebug/tensorflow/keras.py | 286 ++++------ ...iler_all_params_config_parser_by_step.json | 8 - ...filer_cprofiler_config_parser_by_step.json | 7 - ...er_pyinstrument_config_parser_by_step.json | 7 - .../tensorflow2/test_native_tf2_profiler.py | 538 +++++++++--------- 6 files changed, 368 insertions(+), 487 deletions(-) delete mode 100644 tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json delete mode 100644 tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json delete mode 100644 tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json diff --git a/smdebug/core/hook.py b/smdebug/core/hook.py index 04fceb585..9d8df5dd6 100644 --- a/smdebug/core/hook.py +++ b/smdebug/core/hook.py @@ -558,18 +558,19 @@ def _cleanup(self): if self.first_process is True: remove_claim_file(self.out_dir) - def _increment_step(self): + def _increment_step(self, write_state=True): # Update the last_state to the last step number that was saved or seen - self._write_state() + if write_state: + self._write_state() + self.written_tensor_name_for_step.clear() + self._collections_to_save_for_step = None self.step += 1 self.mode_steps[self.mode] += 1 - self.written_tensor_name_for_step.clear() # Increment Global step number irrespective of what mode it is if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - self._collections_to_save_for_step = None # Called in the internal AWS codebase to determine # if a particular tensor value should be saved diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 267d71257..d5d200647 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -714,6 +714,94 @@ def _remove_fetches_and_callbacks(self, mode): x.fetch_callbacks.pop(tf_obj) self._fetches_added.clear() + def _start_phase_python_profiling(self, mode): + if self.python_profiler: + self.python_profiler.stop_profiling( + StepPhase.STEP_START, + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] + ): + self.python_profiler.start_profiling( + StepPhase.STEP_START, + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], + ) + + def _end_phase_python_profiling(self, mode): + if self.python_profiler: + self.python_profiler.stop_profiling( + StepPhase.STEP_END, + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] + ): + self.python_profiler.start_profiling( + StepPhase.STEP_END, + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], + ) + + def _begin_detailed_profiling(self, mode=ModeKeys.TRAIN): + if is_profiler_supported_for_tf_version(): + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] + ): + if not self.is_detailed_profiling: + self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( + self.profiler_config_parser.config.local_path, + "tensorflow", + self.mode_steps[mode], + ) + self.logger.info(f"Enabling TF profiler on step: = {self.mode_steps[mode]}") + if not self.warm_up_completed: + # warming up profiler before it will be profiling. 
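+                        # Note: tf_profiler is tensorflow.python.profiler.profiler_v2 (imported in
+                        # __init__); warmup() runs a short dummy start/stop so the one-time profiler
+                        # initialization cost is not charged to the first profiled step. The
+                        # warm_up_completed flag keeps this to a single warmup per process.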
+ self.tf_profiler.warmup() + self.warm_up_completed = True + self.tf_profiler.start(self._log_dir) + self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS + self.is_detailed_profiling = True + elif self.is_detailed_profiling: + self.logger.info(f"Disabling TF profiler on step: ={self.mode_steps[mode]}") + stop_tf_profiler( + tf_profiler=self.tf_profiler, + log_dir=self._log_dir, + start_time_us=self.tf_profiler_start_time_in_micros, + ) + self.is_detailed_profiling = False + + def _end_detailed_profiling(self): + if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: + self.logger.info("Disabling profiler, reached end of training.") + stop_tf_profiler( + tf_profiler=self.tf_profiler, + log_dir=self._log_dir, + start_time_us=self.tf_profiler_start_time_in_micros, + ) + self.is_detailed_profiling = False + + def _begin_dataloader_profiling(self, mode): + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] + ) and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_START_FLAG_FILENAME + ): + self.is_dataloader_profiling = True + elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_END_FLAG_FILENAME + ): + self.is_dataloader_profiling = False + + def _end_dataloader_profiling(self): + if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_END_FLAG_FILENAME + ): + self.is_dataloader_profiling = False + def on_epoch_begin(self, batch, logs=None): pass @@ -753,34 +841,12 @@ def on_test_begin(self, logs=None): self._on_any_mode_begin(ModeKeys.EVAL) def _on_any_mode_end(self, mode): - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_END, - end_mode=mode_keys_to_python_profile_mode(mode), - end_step=self.mode_steps[mode], - ) - self.python_profiler.start_profiling( - StepPhase.STEP_END, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) - - if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME - ): - self.is_dataloader_profiling = False + self._end_phase_python_profiling(mode=mode) + self._end_detailed_profiling() def on_train_end(self, logs=None): self._on_any_mode_end(ModeKeys.TRAIN) - - if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: - self.logger.info("Disabling profiler, reached end of training.") - stop_tf_profiler( - tf_profiler=self.tf_profiler, - log_dir=self._log_dir, - start_time_us=self.tf_profiler_start_time_in_micros, - ) - self.is_detailed_profiling = False + self._end_detailed_profiling() # throws error in keras if this fn is absent def on_test_end(self, logs=None): @@ -826,31 +892,9 @@ def _on_any_batch_begin(self, batch, mode, logs=None): self.profiler_config_parser.load_config() - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] - ) and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_START_FLAG_FILENAME - ): - self.is_dataloader_profiling = True - elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME - ): - self.is_dataloader_profiling = False + self._begin_dataloader_profiling(mode=mode) - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(mode), - 
end_step=self.mode_steps[mode], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] - ): - self.python_profiler.start_profiling( - StepPhase.STEP_START, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) + self._start_phase_python_profiling(mode=mode) if self.prepared_collections is False: # sets prepared_collections to True here @@ -884,37 +928,7 @@ def _on_any_batch_begin(self, batch, mode, logs=None): def on_train_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.TRAIN, logs=logs) - - if is_profiler_supported_for_tf_version(): - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[ModeKeys.TRAIN] - ): - if not self.is_detailed_profiling: - self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( - self.profiler_config_parser.config.local_path, - "tensorflow", - self.mode_steps[ModeKeys.TRAIN], - ) - self.logger.info( - f"Enabling TF profiler on step: = {self.mode_steps[ModeKeys.TRAIN]}" - ) - if not self.warm_up_completed: - # warming up profiler before it will be profiling. - self.tf_profiler.warmup() - self.warm_up_completed = True - self.tf_profiler.start(self._log_dir) - self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS - self.is_detailed_profiling = True - elif self.is_detailed_profiling: - self.logger.info( - f"Disabling TF profiler on step: ={self.mode_steps[ModeKeys.TRAIN]}" - ) - stop_tf_profiler( - tf_profiler=self.tf_profiler, - log_dir=self._log_dir, - start_time_us=self.tf_profiler_start_time_in_micros, - ) - self.is_detailed_profiling = False + self._begin_detailed_profiling() def on_test_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.EVAL, logs=logs) @@ -1007,20 +1021,7 @@ def _on_any_batch_end(self, batch, mode, logs=None): self._export_model() self._exported_model[self.mode] = True - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_END, - end_mode=mode_keys_to_python_profile_mode(mode), - end_step=self.mode_steps[mode], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] - ): - self.python_profiler.start_profiling( - StepPhase.STEP_END, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) + self._end_phase_python_profiling(mode=mode) def on_train_batch_end(self, batch, logs=None): self._on_any_batch_end(batch, ModeKeys.TRAIN, logs=logs) @@ -1296,7 +1297,7 @@ def record_tensor_value(self, tensor_name, tensor_value): self._initialize_writers(only_initialize_if_missing=True) self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) - def profiling_start_batch(self, mode): + def profiling_start_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the start of train batch when native tf2 training is used. """ @@ -1308,70 +1309,20 @@ def profiling_start_batch(self, mode): self.set_mode(mode) # When only profiler is enabled in the native tf2 training, - # increasing the step number in the TRAIN and GLOBAL mode. + # increasing the step number in the TRAIN and GLOBAL mode + # and not writing the state. 
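+        # The debugger path (debugger_native_training is True) already increments the step when
+        # the wrapped GradientTape is pushed, so this branch only runs in the profiler-only case.
+        # write_state then evaluates to False: with no debugger attached there are no saved
+        # tensors whose state would need to be written out.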
if not self.debugger_native_training: - self.step += 1 - self.mode_steps[self.mode] += 1 - # Increment Global step number irrespective of what mode it is - if self.mode != ModeKeys.GLOBAL: - self.mode_steps[ModeKeys.GLOBAL] = self.step + self._increment_step(write_state=self.debugger_native_training) self.profiler_config_parser.load_config() - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] - ) and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_START_FLAG_FILENAME - ): - self.is_dataloader_profiling = True - elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME - ): - self.is_dataloader_profiling = False + self._begin_dataloader_profiling(mode=mode) - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(mode), - end_step=self.mode_steps[mode], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] - ): - self.python_profiler.start_profiling( - StepPhase.STEP_START, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) + self._start_phase_python_profiling(mode=mode) - if is_profiler_supported_for_tf_version(): - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] - ): - if not self.is_detailed_profiling: - self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( - self.profiler_config_parser.config.local_path, - "tensorflow", - self.mode_steps[mode], - ) - self.logger.info(f"Enabling TF profiler on step: = {self.mode_steps[mode]}") - if not self.warm_up_completed: - # warming up profiler before it will be profiling. - self.tf_profiler.warmup() - self.warm_up_completed = True - self.tf_profiler.start(self._log_dir) - self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS - self.is_detailed_profiling = True - elif self.is_detailed_profiling: - self.logger.info(f"Disabling TF profiler on step: ={self.mode_steps[mode]}") - stop_tf_profiler( - tf_profiler=self.tf_profiler, - log_dir=self._log_dir, - start_time_us=self.tf_profiler_start_time_in_micros, - ) - self.is_detailed_profiling = False + self._begin_detailed_profiling(mode=mode) - def profiling_end_batch(self, mode): + def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the end of train batch when native tf2 training is used. 
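        It records the step duration to the timeline and switches python profiling from the
        step-start phase to the step-end phase.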
""" @@ -1388,20 +1339,7 @@ def profiling_end_batch(self, mode): step_num=str(self.mode_steps[mode]), ) - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_END, - end_mode=mode_keys_to_python_profile_mode(mode), - end_step=self.mode_steps[mode], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] - ): - self.python_profiler.start_profiling( - StepPhase.STEP_END, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) + self._end_phase_python_profiling(mode=mode) def profiling_end(self): """ @@ -1409,17 +1347,5 @@ def profiling_end(self): """ # Unwrap the tape before closing and close the python profiling self.close() - - if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME - ): - self.is_dataloader_profiling = False - - if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: - self.logger.info("Disabling profiler, reached end of training.") - stop_tf_profiler( - tf_profiler=self.tf_profiler, - log_dir=self._log_dir, - start_time_us=self.tf_profiler_start_time_in_micros, - ) - self.is_detailed_profiling = False + self._end_dataloader_profiling() + self._end_detailed_profiling() diff --git a/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json deleted file mode 100644 index c119eebf8..000000000 --- a/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "ProfilingParameters": { - "ProfilerEnabled": true, - "LocalPath": "/tmp/test", - "DetailedProfilingConfig": "{\"StartStep\": 2, \"NumSteps\": 2}", - "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" - } -} diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json deleted file mode 100644 index e51c386c2..000000000 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "ProfilingParameters": { - "ProfilerEnabled": true, - "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" - } -} diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json deleted file mode 100644 index 53ac1485e..000000000 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "ProfilingParameters": { - "ProfilerEnabled": true, - "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" - } -} diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiler.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py index be2b6e124..f3ddea198 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiler.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -32,49 +32,121 @@ from smdebug.tensorflow import KerasHook as Hook -@pytest.fixture() -def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") 
+@pytest.fixture +def profiler_config_path(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "profiler_config.json") monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join( - config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json" - ) - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join( - config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json" - ) - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_profiler_config_parser_by_step_all_params(config_folder, monkeypatch): - config_path = os.path.join( - config_folder, "test_tf2_python_profiler_all_params_config_parser_by_step.json" - ) - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - + yield config_path + if os.path.isfile(config_path): + os.remove(config_path) + + +def _convert_to_string(item): + return '"{0}"'.format(item) if isinstance(item, str) else item + + +def _convert_key_and_value(key, value): + return "{0}: {1}, ".format(_convert_to_string(key), _convert_to_string(value)) + + +def generate_profiler_config_parser(profiling_type, profiler_config_path, profiling_parameters): + python_profiler_config, detailed_profiler_config = "{}", "{}" + + if profiling_type == "PythonProfiling": + start_step, num_steps, profiler_name, cprofile_timer = profiling_parameters + python_profiler_config = "{" + if start_step is not None: + python_profiler_config += _convert_key_and_value("StartStep", start_step) + if num_steps is not None: + python_profiler_config += _convert_key_and_value("NumSteps", num_steps) + if profiler_name is not None: + python_profiler_config += _convert_key_and_value("ProfilerName", profiler_name) + if cprofile_timer is not None: + python_profiler_config += _convert_key_and_value("cProfileTimer", cprofile_timer) + python_profiler_config += "}" + + if profiling_type == "DetailedProfiling": + start_step, num_steps, start_time, duration = profiling_parameters + detailed_profiler_config = "{" + if start_step: + detailed_profiler_config += _convert_key_and_value("StartStep", start_step) + if num_steps: + detailed_profiler_config += _convert_key_and_value("NumSteps", num_steps) + if start_time: + detailed_profiler_config += _convert_key_and_value( + "StartTimeInSecSinceEpoch", start_time + ) + if duration: + detailed_profiler_config += _convert_key_and_value("DurationInSeconds", duration) + detailed_profiler_config += "}" + + full_config = { + "ProfilingParameters": { + "ProfilerEnabled": True, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": python_profiler_config, + "DetailedProfilingConfig": detailed_profiler_config, + } + } + + with open(profiler_config_path, "w") as f: + json.dump(full_config, f) + + profiler_config_parser = ProfilerConfigParser() + assert profiler_config_parser.profiling_enabled + + return profiler_config_parser + + +def 
generate_profiler_config_parser_all_params( + profiler_config_path, python_profiling_parameters, detailed_profiling_parameters +): -def set_up_profiling(profilerconfig): - profiler_config_parser = profilerconfig + start_step_1, num_steps_1, profiler_name, cprofile_timer = python_profiling_parameters + start_step_2, num_steps_2, start_time, duration = detailed_profiling_parameters + + python_profiler_config = "{" + if start_step_1 is not None: + python_profiler_config += _convert_key_and_value("StartStep", start_step_1) + if num_steps_1 is not None: + python_profiler_config += _convert_key_and_value("NumSteps", num_steps_1) + if profiler_name is not None: + python_profiler_config += _convert_key_and_value("ProfilerName", profiler_name) + if cprofile_timer is not None: + python_profiler_config += _convert_key_and_value("cProfileTimer", cprofile_timer) + python_profiler_config += "}" + + detailed_profiler_config = "{" + if start_step_2: + detailed_profiler_config += _convert_key_and_value("StartStep", start_step_2) + if num_steps_2: + detailed_profiler_config += _convert_key_and_value("NumSteps", num_steps_2) + if start_time: + detailed_profiler_config += _convert_key_and_value("StartTimeInSecSinceEpoch", start_time) + if duration: + detailed_profiler_config += _convert_key_and_value("DurationInSeconds", duration) + detailed_profiler_config += "}" + + full_config = { + "ProfilingParameters": { + "ProfilerEnabled": True, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": python_profiler_config, + "DetailedProfilingConfig": detailed_profiler_config, + } + } + + with open(profiler_config_path, "w") as f: + json.dump(full_config, f) + + profiler_config_parser = ProfilerConfigParser() + assert profiler_config_parser.profiling_enabled + + return profiler_config_parser + + +def set_up_profiling(profiler_config_parser): + profiler_config_parser = profiler_config_parser python_profiler = None if profiler_config_parser.profiling_enabled: config = profiler_config_parser.config @@ -123,17 +195,17 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) - current_step = 0 + step = 0 n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: labels = tf.one_hot(labels, depth=10) if debugger: with hook.wrap_tape(tf.GradientTape()) as tape: - hook.profiling_start_batch(mode=smd.modes.TRAIN) + hook.profiling_start_batch() logits = train_step(data, labels) - if python_profiler and start_step <= current_step < end_step: - assert python_profiler._start_step == current_step + if python_profiler and start_step <= step < end_step: + assert python_profiler._start_step == step assert python_profiler._start_phase == StepPhase.STEP_START grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) @@ -142,33 +214,33 @@ def train_step(images, labels): hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) else: with tf.GradientTape() as tape: - hook.profiling_start_batch(mode=smd.modes.TRAIN) + hook.profiling_start_batch() logits = train_step(data, labels) - if python_profiler and start_step <= current_step < end_step: - assert python_profiler._start_step == current_step + if python_profiler and start_step <= step < end_step: + assert python_profiler._start_step == step assert python_profiler._start_phase == StepPhase.STEP_START grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) - hook.profiling_end_batch(mode=smd.modes.TRAIN) + hook.profiling_end_batch() + if python_profiler and 
start_step <= step < end_step: + assert python_profiler._start_step == step + assert python_profiler._start_phase == StepPhase.STEP_END + step += 1 hook.profiling_end() + if python_profiler: + assert python_profiler._start_step == step - 1 + assert python_profiler._start_phase == StepPhase.STEP_END -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): +def verify_num_trace_events(profilerconfig): """ - This test executes a TF2 native training script with profiler, enables detailed TF profiling by step, and - verifies the number of events. + This verifies the number of events when detailed profiling is enabled. """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_gradtape(hook=hook) - t_events = TensorboardProfilerEvents() # get tensorboard timeline files files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + for path in Path(profilerconfig.config.local_path + "/framework").rglob( f"*{TENSORBOARDTIMELINE_SUFFIX}" ): files.append(path) @@ -188,162 +260,38 @@ def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step assert num_trace_events >= 230 -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script with profiler, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_gradtape(hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - - -@pytest.mark.skip_if_non_eager -def test_native_python_profiling_cprofiler(out_dir, tf2_python_cprofiler_config_parser_by_step): - """ - This test executes a TF2 native training script with profiler, enables cprofiler by step, and - verifies the python profiling's steps and expected output files. - """ - assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling( - tf2_python_cprofiler_config_parser_by_step - ) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = CPROFILE_NAME - allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_gradtape( - hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step - ) - - # Test that directory and corresponding files exist. 
- assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - # Since python_profiler.stop_profiling for the posthookclose step automatically executed - # upon normal interpreter termination, - # the number of the files is (end_step - start_step) * 2 + 2 - 1. - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) +def train_loop(out_dir, debugger=False): + hook = Hook(out_dir=out_dir, save_all=True) + helper_native_tf2_gradtape(hook=hook, debugger=debugger) -@pytest.mark.skip_if_non_eager -def test_native_python_profiling_pyinstrument( - out_dir, tf2_python_pyinstrument_config_parser_by_step -): +def verify_tensor_names(out_dir): """ - This test executes a TF2 native training script with profiler, enables pyinstrument by step, and - verifies the python profiling's steps and expected output files. + This verifies the tensor names when debugger is enabled. """ - assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling( - tf2_python_pyinstrument_config_parser_by_step - ) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = PYINSTRUMENT_NAME - allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] - python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_gradtape( - hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step - ) - - # Test that directory and corresponding files exist. - assert os.path.isdir(python_stats_dir) - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - # Since python_profiler.stop_profiling for the posthookclose step automatically executed - # upon normal interpreter termination, - # the number of the files is (end_step - start_step) * 2 + 2 - 1. - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) + trial = smd.create_trial(out_dir) + assert len(trial.steps()) > 0, "Nothing saved at any step." + assert len(trial.tensor_names()) > 0, "Tensors were not saved." 
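+    # The hook is created with save_all=True, so the default collections (losses, weights,
+    # biases, optimizer variables) plus the manually saved inputs/outputs are expected here.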
+ assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0 + assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) > 0 + assert trial.tensor_names(collection="optimizer_variables") == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] -@pytest.mark.skip_if_non_eager -def test_create_timeline_file(simple_profiler_config_parser, out_dir): +def verify_timeline_file(out_dir): """ - This test is to test the creation of the timeline file according to file path specification. + This verifies the creation of the timeline file according to file path specification. It reads backs the file contents to make sure it is in valid JSON format. """ - assert simple_profiler_config_parser.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_gradtape(hook=hook) - files = [] for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): files.append(path) @@ -365,40 +313,47 @@ def test_create_timeline_file(simple_profiler_config_parser, out_dir): assert events_dict -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_debugger_all_params( - tf2_profiler_config_parser_by_step_all_params, out_dir -): +def verify_python_profiling(profiler_name, out_dir, profilerconfig, debugger=False): """ - This test executes a TF2 native training script with debugger and profiler, enables detailed TF profiling, python - profiling by step. + This executes a TF2 native training script with profiler or both profiler and debugger, + enables python profiling by step, and verifies the python profiling's steps and expected output files. """ - assert tf2_profiler_config_parser_by_step_all_params.profiling_enabled + assert profilerconfig.profiling_enabled - profiler_config_parser, python_profiler = set_up_profiling( - tf2_profiler_config_parser_by_step_all_params - ) + profiler_config_parser, python_profiler = set_up_profiling(profilerconfig) config = profiler_config_parser.config start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps - profiler_name = CPROFILE_NAME - allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) + if profiler_name == CPROFILE_NAME: + allowed_files = [CPROFILE_STATS_FILENAME] + if profiler_name == PYINSTRUMENT_NAME: + allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] + + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) hook = Hook(out_dir=out_dir, save_all=True) hook.python_profiler = python_profiler - helper_native_tf2_gradtape(hook=hook, debugger=True) + helper_native_tf2_gradtape( + hook=hook, + python_profiler=python_profiler, + start_step=start_step, + end_step=end_step, + debugger=debugger, + ) - # Verifying python profiling related files. + # Test that directory and corresponding files exist. 
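+    # Expected layout (sketch): <out_dir>/framework/tensorflow/<profiler_name>/<node_id>/ holds
+    # one stats directory per profiled phase, and each stats directory contains only the files
+    # listed in allowed_files for that profiler.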
assert os.path.isdir(python_stats_dir) for node_id in os.listdir(python_stats_dir): node_dir_path = os.path.join(python_stats_dir, node_id) stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + # Since python_profiler.stop_profiling for the posthookclose step automatically executed + # upon normal interpreter termination, + # the number of the files is num_steps * 2 + 2 - 1. + assert len(stats_dirs) == num_steps * 2 + 1 for stats_dir in stats_dirs: # Validate that the expected files are in the stats dir @@ -415,64 +370,85 @@ def test_native_tf2_profiler_debugger_all_params( with open(stats_path, "r") as f: assert json.load(f) - # Verifying detailed TF profiling. - t_events = TensorboardProfilerEvents() - # get tensorboard timeline files - files = [] - for path in Path( - tf2_profiler_config_parser_by_step_all_params.config.local_path + "/framework" - ).rglob(f"*{TENSORBOARDTIMELINE_SUFFIX}"): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - # Verifying timeline files. - files = [] - for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): - files.append(path) - - assert len(files) == 1 - - file_ts = files[0].name.split("_")[0] - folder_name = files[0].parent.name - assert folder_name == time.strftime( - TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) - ) - assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( - TRACE_DIRECTORY_FORMAT - ) - - with open(files[0]) as timeline_file: - events_dict = json.load(timeline_file) - - assert events_dict - - # Verifying tensor names. - trial = smd.create_trial(out_dir) - assert len(trial.steps()) > 0, "Nothing saved at any step." - assert len(trial.tensor_names()) > 0, "Tensors were not saved." 
- assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] - assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0 - assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) > 0 - assert trial.tensor_names(collection="optimizer_variables") == [ - "Adam/beta_1:0", - "Adam/beta_2:0", - "Adam/decay:0", - "Adam/iter:0", - "Adam/learning_rate:0", - ] - assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] - assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] +@pytest.mark.skip_if_non_eager +@pytest.mark.parametrize("enable_detailed_profiling", [False, True]) +@pytest.mark.parametrize("enable_python_profiling", [False, CPROFILE_NAME, PYINSTRUMENT_NAME]) +@pytest.mark.parametrize("enable_debugger", [False, True]) +def test_native_tf2_profiling_debugger( + enable_detailed_profiling, + enable_python_profiling, + enable_debugger, + profiler_config_path, + out_dir, +): + if not enable_debugger: + if enable_detailed_profiling and not enable_python_profiling: + profiler_config_parser = generate_profiler_config_parser( + "DetailedProfiling", profiler_config_path, (8, 4, None, None) + ) + train_loop(out_dir) + verify_num_trace_events(profiler_config_parser) + verify_timeline_file(out_dir) + elif not enable_detailed_profiling and enable_python_profiling: + if enable_python_profiling == CPROFILE_NAME: + profiler_config_parser = generate_profiler_config_parser( + "PythonProfiling", profiler_config_path, (5, 2, CPROFILE_NAME, None) + ) + verify_python_profiling(CPROFILE_NAME, out_dir, profiler_config_parser) + verify_timeline_file(out_dir) + if enable_python_profiling == PYINSTRUMENT_NAME: + profiler_config_parser = generate_profiler_config_parser( + "PythonProfiling", profiler_config_path, (10, 3, PYINSTRUMENT_NAME, None) + ) + verify_python_profiling(PYINSTRUMENT_NAME, out_dir, profiler_config_parser) + verify_timeline_file(out_dir) + elif enable_detailed_profiling and enable_python_profiling: + profiler_config_parser = generate_profiler_config_parser_all_params( + profiler_config_path, (4, 2, enable_python_profiling, None), (8, 1, None, None) + ) + verify_python_profiling(enable_python_profiling, out_dir, profiler_config_parser) + verify_num_trace_events(profiler_config_parser) + verify_timeline_file(out_dir) + else: + pass + else: + if enable_detailed_profiling and not enable_python_profiling: + profiler_config_parser = generate_profiler_config_parser( + "DetailedProfiling", profiler_config_path, (8, 4, None, None) + ) + train_loop(out_dir, debugger=True) + verify_num_trace_events(profiler_config_parser) + verify_timeline_file(out_dir) + verify_tensor_names(out_dir) + elif not enable_detailed_profiling and enable_python_profiling: + if enable_python_profiling == CPROFILE_NAME: + profiler_config_parser = generate_profiler_config_parser( + "PythonProfiling", profiler_config_path, (5, 2, CPROFILE_NAME, None) + ) + verify_python_profiling( + CPROFILE_NAME, out_dir, profiler_config_parser, debugger=True + ) + verify_timeline_file(out_dir) + verify_tensor_names(out_dir) + if enable_python_profiling == PYINSTRUMENT_NAME: + profiler_config_parser = generate_profiler_config_parser( + "PythonProfiling", profiler_config_path, (10, 3, PYINSTRUMENT_NAME, None) + ) + verify_python_profiling( + PYINSTRUMENT_NAME, out_dir, profiler_config_parser, debugger=True + ) + verify_timeline_file(out_dir) + verify_tensor_names(out_dir) + elif enable_detailed_profiling and enable_python_profiling: + profiler_config_parser = 
generate_profiler_config_parser_all_params( + profiler_config_path, (4, 2, enable_python_profiling, None), (8, 1, None, None) + ) + verify_python_profiling( + enable_python_profiling, out_dir, profiler_config_parser, debugger=True + ) + verify_num_trace_events(profiler_config_parser) + verify_timeline_file(out_dir) + verify_tensor_names(out_dir) + else: + pass From ae97c0d88f5eb84f9fae9b6b8d6bb634bc8bef1c Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:32:23 -0800 Subject: [PATCH 70/97] add changes for enabling profiling for native tf2 training --- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py new file mode 100644 index 000000000..e69de29bb From 3a35cfb6742ffcab267b7ecf7dc6daa052570ecf Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:42:55 -0800 Subject: [PATCH 71/97] add changes for enabling profiling in the native tf2 training --- smdebug/tensorflow/keras.py | 11 +- .../tensorflow2/test_native_tf2_profiling.py | 144 ++++++++++++++++++ 2 files changed, 150 insertions(+), 5 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index d5d200647..0a3a3222d 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -122,8 +122,9 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag indicates to debugging for tensorflow2 native training - self.debugger_native_training = False + # this flag is used to handle step number increment in the tensorflow native training + # it indicated to profiling for tensorflow2 native training + self.profiling_native_training = False if self.python_profiler: atexit.register(self.python_profiler.stop_profiling, StepPhase.END) @@ -1110,7 +1111,6 @@ def close(self): start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) - self.debugger_native_training = False def _cleanup(self): # Unwrap the tape before closing @@ -1149,7 +1149,8 @@ def run(*args, **kwargs): self._prepare_collections() self.prepared_collections = True - self._increment_step() + if not self.profiling_native_training: + self._increment_step() if self._get_collections_to_save_for_step(): self._initialize_writers() @@ -1267,7 +1268,6 @@ def wrap_tape(self, tape): """ from tensorflow.python.eager.backprop import GradientTape - self.debugger_native_training = True self.set_mode(ModeKeys.TRAIN) if isinstance(tape, GradientTape): @@ -1349,3 +1349,4 @@ def profiling_end(self): self.close() self._end_dataloader_profiling() self._end_detailed_profiling() + diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index e69de29bb..16b9e121a 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -0,0 +1,144 @@ +# Standard Library +import os +from pathlib import Path + +# Third Party +import tensorflow as tf +import pytest +# from tests.tensorflow2.test_keras import helper_keras_fit + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.profiler.profiler_config_parser 
import ProfilerConfigParser +from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX +from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents +from smdebug.tensorflow import KerasHook as Hook + +@pytest.fixture() +def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +def create_hook(trial_dir): + hook = smd.KerasHook(trial_dir, save_all=True) + return hook + + +def create_model(): + model = tf.keras.models.Sequential( + [ + # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 + tf.keras.layers.Flatten(input_shape=(28, 28, 1)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) + return model + +def helper_native_tf2(): + + +def test_gradtape_tf_function(out_dir): + def get_grads(images, labels): + # with tf.GradientTape() as tape: + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(64) + model = create_model() + hook = create_hook(out_dir) + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + dataset_labels = labels + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with hook.wrap_tape(tf.GradientTape()) as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + model.save(out_dir, save_format="tf") + + + trial = smd.create_trial(out_dir) + assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ + "weights/dense/kernel:0", + "weights/dense_1/kernel:0", + ] + assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ + "weights/dense/bias:0", + "weights/dense_1/bias:0", + ] + assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] + + +def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. 
+ """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) + hook.close() + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 \ No newline at end of file From 81234de8b01b8cdcab4714aad6b448b34fe1027e Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 11 Jan 2021 11:08:36 -0800 Subject: [PATCH 72/97] add tests --- smdebug/tensorflow/keras.py | 23 +- ...filer_cprofiler_config_parser_by_step.json | 7 + ...er_pyinstrument_config_parser_by_step.json | 7 + .../tensorflow2/test_native_tf2_profiling.py | 349 +++++++++++++++++- 4 files changed, 351 insertions(+), 35 deletions(-) create mode 100644 tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json create mode 100644 tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 0a3a3222d..784b26c37 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -122,8 +122,8 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag is used to handle step number increment in the tensorflow native training - # it indicated to profiling for tensorflow2 native training + # this flag is used to handle step number increment in the tensorflow native training when profiler is on + # it indicates to profiling for tensorflow2 native training self.profiling_native_training = False if self.python_profiler: @@ -1297,24 +1297,16 @@ def record_tensor_value(self, tensor_name, tensor_value): self._initialize_writers(only_initialize_if_missing=True) self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) + def profiling_start_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the start of train batch when native tf2 training is used. """ - self.start = time.time() - - if self._is_not_supported(): - return - - self.set_mode(mode) - - # When only profiler is enabled in the native tf2 training, - # increasing the step number in the TRAIN and GLOBAL mode - # and not writing the state. - if not self.debugger_native_training: - self._increment_step(write_state=self.debugger_native_training) + self.set_mode(ModeKeys.TRAIN) - self.profiler_config_parser.load_config() + self.profiling_native_training = True + if self.profiling_native_training: + self._increment_step() self._begin_dataloader_profiling(mode=mode) @@ -1326,6 +1318,7 @@ def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the end of train batch when native tf2 training is used. 
""" + if self._is_not_supported(): return diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json new file mode 100644 index 000000000..2ab039217 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -0,0 +1,7 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2}" + } +} \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json new file mode 100644 index 000000000..325224801 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -0,0 +1,7 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"python_profiler\": \"Pyinstrument\"}" + } +} \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 16b9e121a..3e1f08b2b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -1,19 +1,40 @@ # Standard Library import os +import json +import time +from datetime import datetime from pathlib import Path +import pstats # Third Party import tensorflow as tf import pytest -# from tests.tensorflow2.test_keras import helper_keras_fit # First Party import smdebug.tensorflow as smd from smdebug.core.collection import CollectionKeys +from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter from smdebug.profiler.profiler_config_parser import ProfilerConfigParser from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents from smdebug.tensorflow import KerasHook as Hook +from smdebug.profiler.profiler_constants import ( + CONVERT_TO_MICROSECS, + DEFAULT_PREFIX, + TRACE_DIRECTORY_FORMAT, + CPROFILE_NAME, + CPROFILE_STATS_FILENAME, + PYINSTRUMENT_HTML_FILENAME, + PYINSTRUMENT_JSON_FILENAME, + PYINSTRUMENT_NAME, +) +from smdebug.profiler.python_profiler import PythonProfiler +from smdebug.profiler.python_profiler import ( + PyinstrumentPythonProfiler, + cProfilePythonProfiler, + cProfileTimer, +) +from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase @pytest.fixture() def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @@ -22,6 +43,20 @@ def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): return ProfilerConfigParser() +@pytest.fixture() +def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + @pytest.fixture() def tf2_profiler_config_parser_by_time(config_folder, 
monkeypatch): config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") @@ -29,9 +64,24 @@ def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): return ProfilerConfigParser() -def create_hook(trial_dir): - hook = smd.KerasHook(trial_dir, save_all=True) - return hook +@pytest.fixture +def test_framework(): + return "test-framework" + + +@pytest.fixture() +def cprofile_python_profiler(out_dir, test_framework): + return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) + + +@pytest.fixture() +def pyinstrument_python_profiler(out_dir, test_framework): + return PyinstrumentPythonProfiler(out_dir, test_framework) + + +@pytest.fixture() +def framework_dir(out_dir, test_framework): + return "{0}/framework/{1}".format(out_dir, test_framework) def create_model(): @@ -46,12 +96,10 @@ def create_model(): ) return model -def helper_native_tf2(): +def helper_native_tf2_profiler_debugger(trial_dir, hook): -def test_gradtape_tf_function(out_dir): def get_grads(images, labels): - # with tf.GradientTape() as tape: return model(images, training=True) @tf.function @@ -63,32 +111,29 @@ def train_step(images, labels): dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) ) - dataset = dataset.shuffle(1000).batch(64) + dataset = dataset.shuffle(1000).batch(128) model = create_model() - hook = create_hook(out_dir) opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: - dataset_labels = labels labels = tf.one_hot(labels, depth=10) hook.start_profiling_start_train_batch() with hook.wrap_tape(tf.GradientTape()) as tape: logits = train_step(data, labels) grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) - # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(out_dir, save_format="tf") - + model.save(trial_dir, save_format="tf") - trial = smd.create_trial(out_dir) + trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ "weights/dense/kernel:0", @@ -109,7 +154,40 @@ def train_step(images, labels): assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] -def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): +def helper_native_tf2_profiler(trial_dir, hook): + + def get_grads(images, labels): + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(128) + model = create_model() + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + labels = 
tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with tf.GradientTape() as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): """ This test executes a TF2 native training script, enables detailed TF profiling by step, and verifies the number of events. @@ -117,8 +195,7 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config assert tf2_profiler_config_parser_by_step.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) - hook.close() + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -141,4 +218,236 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config # The number of events is varying by a small number on # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 \ No newline at end of file + assert num_trace_events >= 230 + + +def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. 
+ assert num_trace_events >= 700 + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + + +# @pytest.mark.parametrize("use_pyinstrument", [False, True]) +# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) +def test_native_python_profiling_cprofiler( + out_dir, tf2_python_cprofiler_config_parser_by_step +): + assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled + + config = tf2_python_cprofiler_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. 
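# (Illustrative note, not part of the diff.) Quick arithmetic behind the
# directory-count assertion below, using the cProfile config added in this patch
# (StartStep 9, NumSteps 2): stats are dumped around both the start and the end
# of each profiled step, and the final stop only happens when profiling shuts
# down at process exit, so one fewer directory is visible while the test runs.
start_step, num_steps = 9, 2                      # values from the JSON config above
end_step = start_step + num_steps                 # 11
expected_stats_dirs = (end_step - start_step) * 2 + 1
assert expected_stats_dirs == 5                   # per node_id directory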
+ assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_native_python_profiling_pyinstrument( + out_dir, tf2_python_pyinstrument_config_parser_by_step +): + assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled + + config = tf2_python_pyinstrument_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.profiler_name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = PYINSTRUMENT_NAME + allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_create_timeline_file(simple_profiler_config_parser, out_dir): + """ + This test is meant to test successful creation of the timeline file according to file path specification. + $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ + {$ENV_NODE_ID_4digits0padded}_pythontimeline.json + It reads backs the file contents to make sure it is in valid JSON format. 
+ """ + assert simple_profiler_config_parser.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict \ No newline at end of file From 77bb1975f470f1686ef5ce2df4000ff4e5b70d98 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 12:05:19 -0800 Subject: [PATCH 73/97] add python profiler as attr for kerashook --- smdebug/tensorflow/keras.py | 83 ++++++++++++++++++- ...er_pyinstrument_config_parser_by_step.json | 2 +- .../tensorflow2/test_native_tf2_profiling.py | 6 +- 3 files changed, 85 insertions(+), 6 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 784b26c37..b21d2dc9f 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,6 +61,7 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) +print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -126,6 +127,7 @@ def __init__( # it indicates to profiling for tensorflow2 native training self.profiling_native_training = False + if self.python_profiler: atexit.register(self.python_profiler.stop_profiling, StepPhase.END) @@ -842,6 +844,7 @@ def on_test_begin(self, logs=None): self._on_any_mode_begin(ModeKeys.EVAL) def _on_any_mode_end(self, mode): + self._end_phase_python_profiling(mode=mode) self._end_detailed_profiling() @@ -895,7 +898,21 @@ def _on_any_batch_begin(self, batch, mode, logs=None): self._begin_dataloader_profiling(mode=mode) - self._start_phase_python_profiling(mode=mode) + if self.python_profiler: + self.python_profiler.stop_profiling( + StepPhase.STEP_START, + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] + ): + self.python_profiler.start_profiling( + StepPhase.STEP_START, + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], + ) +>>>>>>> add python profiler as attr for kerashook if self.prepared_collections is False: # sets prepared_collections to True here @@ -1022,7 +1039,22 @@ def _on_any_batch_end(self, batch, mode, logs=None): self._export_model() self._exported_model[self.mode] = True - self._end_phase_python_profiling(mode=mode) + + if self.python_profiler: + self.python_profiler.stop_profiling( + StepPhase.STEP_END, + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] + ): + self.python_profiler.start_profiling( + StepPhase.STEP_END, + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], + ) +>>>>>>> add python profiler as attr for kerashook def on_train_batch_end(self, batch, 
logs=None): self._on_any_batch_end(batch, ModeKeys.TRAIN, logs=logs) @@ -1105,6 +1137,7 @@ def unwrap(func): def close(self): self._cleanup() + if self.python_profiler: self.python_profiler.start_profiling( StepPhase.STEP_END, @@ -1112,6 +1145,7 @@ def close(self): start_step=self.mode_steps[self.mode], ) + def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1302,6 +1336,11 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the start of train batch when native tf2 training is used. """ + + self.start = time.time() + + if self._is_not_supported(): + return self.set_mode(ModeKeys.TRAIN) self.profiling_native_training = True @@ -1310,7 +1349,24 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): self._begin_dataloader_profiling(mode=mode) - self._start_phase_python_profiling(mode=mode) + if self.python_profiler: + print('Stop python profiling in start train batch') + print('start train batch stop profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.stop_profiling( + StepPhase.STEP_START, + end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + end_step=self.mode_steps[ModeKeys.TRAIN], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ): + print('Start python profiling in start train batch') + print('start train batch start profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.start_profiling( + StepPhase.STEP_START, + start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + start_step=self.mode_steps[ModeKeys.TRAIN], + ) self._begin_detailed_profiling(mode=mode) @@ -1332,7 +1388,26 @@ def profiling_end_batch(self, mode=ModeKeys.TRAIN): step_num=str(self.mode_steps[mode]), ) - self._end_phase_python_profiling(mode=mode) + + if self.python_profiler: + print('Stop python profiling in end train batch') + print('end train batch stop profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.stop_profiling( + StepPhase.STEP_END, + end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + end_step=self.mode_steps[ModeKeys.TRAIN], + ) + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + ): + print('Start python profiling in end train batch') + print('end train batch start profiling object inside: ', self.python_profiler, self.step) + self.python_profiler.start_profiling( + StepPhase.STEP_END, + start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), + start_step=self.mode_steps[ModeKeys.TRAIN], + ) + def profiling_end(self): """ diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 325224801..02bc8c0d3 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"python_profiler\": \"Pyinstrument\"}" + "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" } } \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 
3e1f08b2b..9ba8dd60b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -173,6 +173,8 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + + # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -338,6 +340,7 @@ def test_native_python_profiling_cprofiler( print('\nnum_steps: ', config.python_profiling_config.num_steps) python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -347,6 +350,7 @@ def test_native_python_profiling_cprofiler( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. @@ -384,7 +388,7 @@ def test_native_python_profiling_pyinstrument( print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps From 72b04e3d7e3eb09e29a734fbc7c94da37ac37794 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 16:01:59 -0800 Subject: [PATCH 74/97] add tests --- ...filer_cprofiler_config_parser_by_step.json | 2 +- ...er_pyinstrument_config_parser_by_step.json | 2 +- .../tensorflow2/test_native_tf2_profiling.py | 34 ++++++++++++------- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index 2ab039217..d568b471c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2}" + "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } } \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 02bc8c0d3..c1c45594c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -2,6 +2,6 @@ "ProfilingParameters": { "ProfilerEnabled": true, "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 9, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" + "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } \ No newline at end of file diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 9ba8dd60b..9a7416be3 
100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -5,6 +5,7 @@ from datetime import datetime from pathlib import Path import pstats +import atexit # Third Party import tensorflow as tf @@ -84,6 +85,18 @@ def framework_dir(out_dir, test_framework): return "{0}/framework/{1}".format(out_dir, test_framework) +def set_up_profiling(profilerconfig): + profiler_config_parser = profilerconfig + python_profiler = None + if profiler_config_parser.profiling_enabled: + config = profiler_config_parser.config + if config.python_profiling_config.is_enabled(): + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + python_profiler.start_profiling(StepPhase.START) + atexit.register(python_profiler.stop_profiling, StepPhase.END) + return profiler_config_parser, python_profiler + + def create_model(): model = tf.keras.models.Sequential( [ @@ -131,7 +144,7 @@ def train_step(images, labels): hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(trial_dir, save_format="tf") + # model.save(trial_dir, save_format="tf") trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] @@ -173,8 +186,6 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) - - # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -196,7 +207,7 @@ def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parse """ assert tf2_profiler_config_parser_by_step.profiling_enabled - hook = Hook(out_dir=out_dir) + hook = Hook(out_dir=out_dir, save_all=True) helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -333,13 +344,13 @@ def test_native_python_profiling_cprofiler( ): assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - config = tf2_python_cprofiler_config_parser_by_step.config + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) + + config = profiler_config_parser.config print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) print('\nname: ', config.python_profiling_config.name) print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps @@ -382,13 +393,9 @@ def test_native_python_profiling_pyinstrument( ): assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - config = tf2_python_pyinstrument_config_parser_by_step.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.profiler_name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + config = profiler_config_parser.config start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -398,6 +405,7 @@ def 
test_native_python_profiling_pyinstrument( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. From eb6477d2239f2177dfb837778043b108c7cd486a Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 14:16:24 -0800 Subject: [PATCH 75/97] update profiler for native tf training --- smdebug/tensorflow/keras.py | 80 ++- ...iler_all_params_config_parser_by_step.json | 8 + ...filer_cprofiler_config_parser_by_step.json | 2 +- ...er_pyinstrument_config_parser_by_step.json | 2 +- .../tensorflow2/test_native_tf2_profiler.py | 341 +++++++++++++ .../tensorflow2/test_native_tf2_profiling.py | 465 ------------------ 6 files changed, 390 insertions(+), 508 deletions(-) create mode 100644 tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json delete mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index b21d2dc9f..27ef87658 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,7 +61,7 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) -print('prezero-step start profiling object outside: ', python_profiler) +# print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -106,6 +106,7 @@ def __init__( # Profiling vars self.tf_profiler = None if is_profiler_supported_for_tf_version(): + # Third Party from tensorflow.python.profiler import profiler_v2 as tf_profiler self.tf_profiler = tf_profiler @@ -338,6 +339,7 @@ def _create_tensors_for_matching_collections( def _get_distributed_model(self, mode): # not available in tf 1.13, code shouldn't reach here for 1.13 # because of _is_not_supported + # Third Party from tensorflow.python.keras.distribute.distributed_training_utils import ( get_distributed_model, ) @@ -1136,15 +1138,13 @@ def unwrap(func): def close(self): self._cleanup() - - if self.python_profiler: self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) - + self.debugger_native_training = False def _cleanup(self): # Unwrap the tape before closing @@ -1183,10 +1183,11 @@ def run(*args, **kwargs): self._prepare_collections() self.prepared_collections = True - if not self.profiling_native_training: - self._increment_step() + self._increment_step() + if self._get_collections_to_save_for_step(): + # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) self._initialize_writers() if self.last_saved_step is not None and self._exported_collections is False: @@ -1300,8 +1301,10 @@ def wrap_tape(self, tape): :return: Wrapped tape of same type as passed. 
This tape should be used for training """ + # Third Party from tensorflow.python.eager.backprop import GradientTape + self.debugger_native_training = True self.set_mode(ModeKeys.TRAIN) if isinstance(tape, GradientTape): @@ -1341,34 +1344,49 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): if self._is_not_supported(): return - self.set_mode(ModeKeys.TRAIN) - self.profiling_native_training = True - if self.profiling_native_training: - self._increment_step() + self.set_mode(mode) - self._begin_dataloader_profiling(mode=mode) + if not self.debugger_native_training: + self.step += 1 + self.mode_steps[self.mode] += 1 + # Increment Global step number irrespective of what mode it is + if self.mode != ModeKeys.GLOBAL: + self.mode_steps[ModeKeys.GLOBAL] = self.step + + print("Step Number in start train batch: ", self.mode_steps[mode]) + + self.profiler_config_parser.load_config() + + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] + ) and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_START_FLAG_FILENAME + ): + self.is_dataloader_profiling = True + elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( + TF_DATALOADER_END_FLAG_FILENAME + ): + self.is_dataloader_profiling = False if self.python_profiler: - print('Stop python profiling in start train batch') - print('start train batch stop profiling object inside: ', self.python_profiler, self.step) + # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - end_step=self.mode_steps[ModeKeys.TRAIN], + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], ) if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] + MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - print('Start python profiling in start train batch') - print('start train batch start profiling object inside: ', self.python_profiler, self.step) + # print("Start python profiling in start train batch") self.python_profiler.start_profiling( StepPhase.STEP_START, - start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - start_step=self.mode_steps[ModeKeys.TRAIN], + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], ) - self._begin_detailed_profiling(mode=mode) + def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ @@ -1389,26 +1407,6 @@ def profiling_end_batch(self, mode=ModeKeys.TRAIN): ) - if self.python_profiler: - print('Stop python profiling in end train batch') - print('end train batch stop profiling object inside: ', self.python_profiler, self.step) - self.python_profiler.stop_profiling( - StepPhase.STEP_END, - end_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - end_step=self.mode_steps[ModeKeys.TRAIN], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[ModeKeys.TRAIN] - ): - print('Start python profiling in end train batch') - print('end train batch start profiling object inside: ', self.python_profiler, self.step) - self.python_profiler.start_profiling( - StepPhase.STEP_END, - start_mode=mode_keys_to_python_profile_mode(ModeKeys.TRAIN), - start_step=self.mode_steps[ModeKeys.TRAIN], - ) - - def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
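For orientation, a minimal sketch of how the profiler-only entry points in the hunk above (profiling_start_batch, profiling_end_batch, profiling_end) are driven from a native TF2 GradientTape loop. It mirrors the helper_native_tf2_gradtape helper used by the tests in this series; the out_dir path and the single pass over the data are placeholders, not values taken from the patch.

import tensorflow as tf
import smdebug.tensorflow as smd
from smdebug.tensorflow import KerasHook as Hook

hook = Hook(out_dir="/tmp/smdebug_run")  # placeholder output directory

# Same toy MNIST pipeline and model as the test helper.
mnist = tf.keras.datasets.mnist
(x_train, y_train), _ = mnist.load_data()
dataset = (
    tf.data.Dataset.from_tensor_slices(
        (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64))
    )
    .shuffle(1000)
    .batch(128)
)
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation="softmax"),
    ]
)
opt = tf.keras.optimizers.Adam()
hook.wrap_optimizer(opt)

@tf.function
def train_step(images, labels):
    return tf.reduce_mean(model(images, training=True))

for data, labels in dataset:
    labels = tf.one_hot(labels, depth=10)
    # Bumps the step (profiler-only path) and starts dataloader, python and
    # detailed profiling for it, according to the profiler config.
    hook.profiling_start_batch(mode=smd.modes.TRAIN)
    with tf.GradientTape() as tape:
        logits = train_step(data, labels)
    grads = tape.gradient(logits, model.variables)
    opt.apply_gradients(zip(grads, model.variables))
    # Marks the end of this step's profiling window and records its duration.
    hook.profiling_end_batch(mode=smd.modes.TRAIN)
hook.profiling_end()  # stop the profiler once training is done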
diff --git a/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json new file mode 100644 index 000000000..c119eebf8 --- /dev/null +++ b/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json @@ -0,0 +1,8 @@ +{ + "ProfilingParameters": { + "ProfilerEnabled": true, + "LocalPath": "/tmp/test", + "DetailedProfilingConfig": "{\"StartStep\": 2, \"NumSteps\": 2}", + "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" + } +} diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index d568b471c..e51c386c2 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} \ No newline at end of file +} diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index c1c45594c..53ac1485e 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } -} \ No newline at end of file +} diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiler.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py index f3ddea198..986be6a46 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiler.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -32,6 +32,7 @@ from smdebug.tensorflow import KerasHook as Hook +<<<<<<< HEAD @pytest.fixture def profiler_config_path(config_folder, monkeypatch): config_path = os.path.join(config_folder, "profiler_config.json") @@ -147,6 +148,51 @@ def generate_profiler_config_parser_all_params( def set_up_profiling(profiler_config_parser): profiler_config_parser = profiler_config_parser +======= +@pytest.fixture() +def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json" + ) + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json" + ) + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", 
config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_profiler_config_parser_by_step_all_params(config_folder, monkeypatch): + config_path = os.path.join( + config_folder, "test_tf2_python_profiler_all_params_config_parser_by_step.json" + ) + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +def set_up_profiling(profilerconfig): + profiler_config_parser = profilerconfig +>>>>>>> update profiler for native tf training python_profiler = None if profiler_config_parser.profiling_enabled: config = profiler_config_parser.config @@ -195,17 +241,28 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) +<<<<<<< HEAD step = 0 +======= + current_step = 0 +>>>>>>> update profiler for native tf training n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: labels = tf.one_hot(labels, depth=10) if debugger: with hook.wrap_tape(tf.GradientTape()) as tape: +<<<<<<< HEAD hook.profiling_start_batch() logits = train_step(data, labels) if python_profiler and start_step <= step < end_step: assert python_profiler._start_step == step +======= + hook.profiling_start_batch(mode=smd.modes.TRAIN) + logits = train_step(data, labels) + if python_profiler and start_step <= current_step < end_step: + assert python_profiler._start_step == current_step +>>>>>>> update profiler for native tf training assert python_profiler._start_phase == StepPhase.STEP_START grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) @@ -214,6 +271,7 @@ def train_step(images, labels): hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) else: with tf.GradientTape() as tape: +<<<<<<< HEAD hook.profiling_start_batch() logits = train_step(data, labels) if python_profiler and start_step <= step < end_step: @@ -236,11 +294,39 @@ def verify_num_trace_events(profilerconfig): """ This verifies the number of events when detailed profiling is enabled. """ +======= + hook.profiling_start_batch(mode=smd.modes.TRAIN) + logits = train_step(data, labels) + if python_profiler and start_step <= current_step < end_step: + assert python_profiler._start_step == current_step + assert python_profiler._start_phase == StepPhase.STEP_START + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.profiling_end_batch(mode=smd.modes.TRAIN) + hook.profiling_end() + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script with profiler, enables detailed TF profiling by step, and + verifies the number of events. 
+ """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_gradtape(hook=hook) + +>>>>>>> update profiler for native tf training t_events = TensorboardProfilerEvents() # get tensorboard timeline files files = [] +<<<<<<< HEAD for path in Path(profilerconfig.config.local_path + "/framework").rglob( +======= + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( +>>>>>>> update profiler for native tf training f"*{TENSORBOARDTIMELINE_SUFFIX}" ): files.append(path) @@ -260,6 +346,7 @@ def verify_num_trace_events(profilerconfig): assert num_trace_events >= 230 +<<<<<<< HEAD def train_loop(out_dir, debugger=False): hook = Hook(out_dir=out_dir, save_all=True) helper_native_tf2_gradtape(hook=hook, debugger=debugger) @@ -292,6 +379,164 @@ def verify_timeline_file(out_dir): This verifies the creation of the timeline file according to file path specification. It reads backs the file contents to make sure it is in valid JSON format. """ +======= +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script with profiler, enables detailed TF profiling by time, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_gradtape(hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + + +@pytest.mark.skip_if_non_eager +def test_native_python_profiling_cprofiler(out_dir, tf2_python_cprofiler_config_parser_by_step): + """ + This test executes a TF2 native training script with profiler, enables cprofiler by step, and + verifies the python profiling's steps and expected output files. + """ + assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled + + profiler_config_parser, python_profiler = set_up_profiling( + tf2_python_cprofiler_config_parser_by_step + ) + + config = profiler_config_parser.config + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) + + hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler + helper_native_tf2_gradtape( + hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) + + # Test that directory and corresponding files exist. 
+ assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + # Since python_profiler.stop_profiling for the posthookclose step automatically executed + # upon normal interpreter termination, + # the number of the files is (end_step - start_step) * 2 + 2 - 1. + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +@pytest.mark.skip_if_non_eager +def test_native_python_profiling_pyinstrument( + out_dir, tf2_python_pyinstrument_config_parser_by_step +): + """ + This test executes a TF2 native training script with profiler, enables pyinstrument by step, and + verifies the python profiling's steps and expected output files. + """ + assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled + + profiler_config_parser, python_profiler = set_up_profiling( + tf2_python_pyinstrument_config_parser_by_step + ) + + config = profiler_config_parser.config + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = PYINSTRUMENT_NAME + allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) + + hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler + helper_native_tf2_gradtape( + hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) + + # Test that directory and corresponding files exist. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + # Since python_profiler.stop_profiling for the posthookclose step automatically executed + # upon normal interpreter termination, + # the number of the files is (end_step - start_step) * 2 + 2 - 1. + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +@pytest.mark.skip_if_non_eager +def test_create_timeline_file(simple_profiler_config_parser, out_dir): + """ + This test is to test the creation of the timeline file according to file path specification. + It reads backs the file contents to make sure it is in valid JSON format. 
+ """ + assert simple_profiler_config_parser.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_gradtape(hook=hook) + +>>>>>>> update profiler for native tf training files = [] for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): files.append(path) @@ -313,6 +558,7 @@ def verify_timeline_file(out_dir): assert events_dict +<<<<<<< HEAD def verify_python_profiling(profiler_name, out_dir, profilerconfig, debugger=False): """ This executes a TF2 native training script with profiler or both profiler and debugger, @@ -321,12 +567,28 @@ def verify_python_profiling(profiler_name, out_dir, profilerconfig, debugger=Fal assert profilerconfig.profiling_enabled profiler_config_parser, python_profiler = set_up_profiling(profilerconfig) +======= +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_debugger_all_params( + tf2_profiler_config_parser_by_step_all_params, out_dir +): + """ + This test executes a TF2 native training script with debugger and profiler, enables detailed TF profiling, python + profiling by step. + """ + assert tf2_profiler_config_parser_by_step_all_params.profiling_enabled + + profiler_config_parser, python_profiler = set_up_profiling( + tf2_profiler_config_parser_by_step_all_params + ) +>>>>>>> update profiler for native tf training config = profiler_config_parser.config start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps +<<<<<<< HEAD if profiler_name == CPROFILE_NAME: allowed_files = [CPROFILE_STATS_FILENAME] @@ -345,15 +607,30 @@ def verify_python_profiling(profiler_name, out_dir, profilerconfig, debugger=Fal ) # Test that directory and corresponding files exist. +======= + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) + + hook = Hook(out_dir=out_dir, save_all=True) + hook.python_profiler = python_profiler + helper_native_tf2_gradtape(hook=hook, debugger=True) + + # Verifying python profiling related files. +>>>>>>> update profiler for native tf training assert os.path.isdir(python_stats_dir) for node_id in os.listdir(python_stats_dir): node_dir_path = os.path.join(python_stats_dir, node_id) stats_dirs = os.listdir(node_dir_path) +<<<<<<< HEAD # Since python_profiler.stop_profiling for the posthookclose step automatically executed # upon normal interpreter termination, # the number of the files is num_steps * 2 + 2 - 1. assert len(stats_dirs) == num_steps * 2 + 1 +======= + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 +>>>>>>> update profiler for native tf training for stats_dir in stats_dirs: # Validate that the expected files are in the stats dir @@ -370,6 +647,7 @@ def verify_python_profiling(profiler_name, out_dir, profilerconfig, debugger=Fal with open(stats_path, "r") as f: assert json.load(f) +<<<<<<< HEAD @pytest.mark.skip_if_non_eager @pytest.mark.parametrize("enable_detailed_profiling", [False, True]) @@ -452,3 +730,66 @@ def test_native_tf2_profiling_debugger( verify_tensor_names(out_dir) else: pass +======= + # Verifying detailed TF profiling. 
+ t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path( + tf2_profiler_config_parser_by_step_all_params.config.local_path + "/framework" + ).rglob(f"*{TENSORBOARDTIMELINE_SUFFIX}"): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + # Verifying timeline files. + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict + + # Verifying tensor names. + trial = smd.create_trial(out_dir) + assert len(trial.steps()) > 0, "Nothing saved at any step." + assert len(trial.tensor_names()) > 0, "Tensors were not saved." + assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0 + assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) > 0 + assert trial.tensor_names(collection="optimizer_variables") == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] +>>>>>>> update profiler for native tf training diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py deleted file mode 100644 index 9a7416be3..000000000 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ /dev/null @@ -1,465 +0,0 @@ -# Standard Library -import os -import json -import time -from datetime import datetime -from pathlib import Path -import pstats -import atexit - -# Third Party -import tensorflow as tf -import pytest - -# First Party -import smdebug.tensorflow as smd -from smdebug.core.collection import CollectionKeys -from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter -from smdebug.profiler.profiler_config_parser import ProfilerConfigParser -from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX -from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents -from smdebug.tensorflow import KerasHook as Hook -from smdebug.profiler.profiler_constants import ( - CONVERT_TO_MICROSECS, - DEFAULT_PREFIX, - TRACE_DIRECTORY_FORMAT, - CPROFILE_NAME, - CPROFILE_STATS_FILENAME, - PYINSTRUMENT_HTML_FILENAME, - PYINSTRUMENT_JSON_FILENAME, - PYINSTRUMENT_NAME, -) -from smdebug.profiler.python_profiler import PythonProfiler -from smdebug.profiler.python_profiler import ( - PyinstrumentPythonProfiler, - cProfilePythonProfiler, - cProfileTimer, -) -from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase - -@pytest.fixture() -def 
tf2_profiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture -def test_framework(): - return "test-framework" - - -@pytest.fixture() -def cprofile_python_profiler(out_dir, test_framework): - return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) - - -@pytest.fixture() -def pyinstrument_python_profiler(out_dir, test_framework): - return PyinstrumentPythonProfiler(out_dir, test_framework) - - -@pytest.fixture() -def framework_dir(out_dir, test_framework): - return "{0}/framework/{1}".format(out_dir, test_framework) - - -def set_up_profiling(profilerconfig): - profiler_config_parser = profilerconfig - python_profiler = None - if profiler_config_parser.profiling_enabled: - config = profiler_config_parser.config - if config.python_profiling_config.is_enabled(): - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") - python_profiler.start_profiling(StepPhase.START) - atexit.register(python_profiler.stop_profiling, StepPhase.END) - return profiler_config_parser, python_profiler - - -def create_model(): - model = tf.keras.models.Sequential( - [ - # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 - tf.keras.layers.Flatten(input_shape=(28, 28, 1)), - tf.keras.layers.Dense(128, activation="relu"), - tf.keras.layers.Dropout(0.2), - tf.keras.layers.Dense(10, activation="softmax"), - ] - ) - return model - - -def helper_native_tf2_profiler_debugger(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with hook.wrap_tape(tf.GradientTape()) as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - 
hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - # model.save(trial_dir, save_format="tf") - - trial = smd.create_trial(trial_dir) - assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] - assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ - "weights/dense/kernel:0", - "weights/dense_1/kernel:0", - ] - assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ - "weights/dense/bias:0", - "weights/dense_1/bias:0", - ] - assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ - "Adam/beta_1:0", - "Adam/beta_2:0", - "Adam/decay:0", - "Adam/iter:0", - "Adam/learning_rate:0", - ] - assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] - assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] - - -def helper_native_tf2_profiler(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with tf.GradientTape() as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir, save_all=True) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - -def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. 
- """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. 
- assert num_trace_events >= 700 - - -# @pytest.mark.parametrize("use_pyinstrument", [False, True]) -# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) -def test_native_python_profiling_cprofiler( - out_dir, tf2_python_cprofiler_config_parser_by_step -): - assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) - - config = profiler_config_parser.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) - print('\ntest function: ', python_profiler) - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = CPROFILE_NAME - allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # Test that directory and corresponding files exist. - assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -def test_native_python_profiling_pyinstrument( - out_dir, tf2_python_pyinstrument_config_parser_by_step -): - assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = PYINSTRUMENT_NAME - allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # Test that directory and corresponding files exist. 
- assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -def test_create_timeline_file(simple_profiler_config_parser, out_dir): - """ - This test is meant to test successful creation of the timeline file according to file path specification. - $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ - {$ENV_NODE_ID_4digits0padded}_pythontimeline.json - It reads backs the file contents to make sure it is in valid JSON format. - """ - assert simple_profiler_config_parser.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - files = [] - for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): - files.append(path) - - assert len(files) == 1 - - file_ts = files[0].name.split("_")[0] - folder_name = files[0].parent.name - assert folder_name == time.strftime( - TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) - ) - assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( - TRACE_DIRECTORY_FORMAT - ) - - with open(files[0]) as timeline_file: - events_dict = json.load(timeline_file) - - assert events_dict \ No newline at end of file From 4b6450bef88351a8a7cdfcee98b128ff93ff01f5 Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Wed, 13 Jan 2021 15:25:40 -0700 Subject: [PATCH 76/97] Modify distributed_training_utils.py import for TF 2.4 (#422) --- smdebug/tensorflow/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index af9a0e901..11d700dfc 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -427,6 +427,10 @@ def is_tf_version_greater_than_2_4_x(): return version.parse("2.4.0") <= TF_VERSION +def is_tf_version_greater_than_2_4_x(): + return version.parse("2.4.0") <= version.parse(tf.__version__) + + def is_profiler_supported_for_tf_version(): # Profiler Support Added For TF Versions 2.2.0 And Greater return version.parse("2.2.0") <= TF_VERSION From 4b7644db8030c487142a68ecb195fe14e361aa8e Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 19:48:15 -0800 Subject: [PATCH 77/97] remove print statement --- smdebug/tensorflow/keras.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 27ef87658..da1ae76a0 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1185,7 +1185,6 @@ def run(*args, **kwargs): self._increment_step() - if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) self._initialize_writers() @@ -1354,7 +1353,7 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): if self.mode != ModeKeys.GLOBAL: 
self.mode_steps[ModeKeys.GLOBAL] = self.step - print("Step Number in start train batch: ", self.mode_steps[mode]) + # print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() From f1d64862f04f5cee4720f719f235407d30b4505c Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:32:23 -0800 Subject: [PATCH 78/97] add changes for enabling profiling for native tf2 training --- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py new file mode 100644 index 000000000..e69de29bb From 865a740856f231244dc77f486f13e35c9cac1f92 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 5 Jan 2021 15:42:55 -0800 Subject: [PATCH 79/97] add changes for enabling profiling in the native tf2 training --- smdebug/tensorflow/keras.py | 2 - .../tensorflow2/test_native_tf2_profiling.py | 144 ++++++++++++++++++ 2 files changed, 144 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index da1ae76a0..2a9d7b8e0 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1300,7 +1300,6 @@ def wrap_tape(self, tape): :return: Wrapped tape of same type as passed. This tape should be used for training """ - # Third Party from tensorflow.python.eager.backprop import GradientTape self.debugger_native_training = True @@ -1414,4 +1413,3 @@ def profiling_end(self): self.close() self._end_dataloader_profiling() self._end_detailed_profiling() - diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index e69de29bb..16b9e121a 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -0,0 +1,144 @@ +# Standard Library +import os +from pathlib import Path + +# Third Party +import tensorflow as tf +import pytest +# from tests.tensorflow2.test_keras import helper_keras_fit + +# First Party +import smdebug.tensorflow as smd +from smdebug.core.collection import CollectionKeys +from smdebug.profiler.profiler_config_parser import ProfilerConfigParser +from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX +from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents +from smdebug.tensorflow import KerasHook as Hook + +@pytest.fixture() +def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +def create_hook(trial_dir): + hook = smd.KerasHook(trial_dir, save_all=True) + return hook + + +def create_model(): + model = tf.keras.models.Sequential( + [ + # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 + tf.keras.layers.Flatten(input_shape=(28, 28, 1)), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation="softmax"), 
+ ] + ) + return model + +def helper_native_tf2(): + + +def test_gradtape_tf_function(out_dir): + def get_grads(images, labels): + # with tf.GradientTape() as tape: + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(64) + model = create_model() + hook = create_hook(out_dir) + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + dataset_labels = labels + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with hook.wrap_tape(tf.GradientTape()) as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + model.save(out_dir, save_format="tf") + + + trial = smd.create_trial(out_dir) + assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] + assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ + "weights/dense/kernel:0", + "weights/dense_1/kernel:0", + ] + assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ + "weights/dense/bias:0", + "weights/dense_1/bias:0", + ] + assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ + "Adam/beta_1:0", + "Adam/beta_2:0", + "Adam/decay:0", + "Adam/iter:0", + "Adam/learning_rate:0", + ] + assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] + assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] + + +def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) + hook.close() + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. 
+ assert num_trace_events >= 230 \ No newline at end of file From cc8d4e788deee0a83237edc95e119efe94cd1daa Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 11 Jan 2021 11:08:36 -0800 Subject: [PATCH 80/97] add tests --- ...filer_cprofiler_config_parser_by_step.json | 1 + ...er_pyinstrument_config_parser_by_step.json | 1 + .../tensorflow2/test_native_tf2_profiling.py | 349 +++++++++++++++++- 3 files changed, 331 insertions(+), 20 deletions(-) diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index e51c386c2..f06218f77 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -5,3 +5,4 @@ "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } } + diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index 53ac1485e..ad5a555f7 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -5,3 +5,4 @@ "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } + diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 16b9e121a..3e1f08b2b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -1,19 +1,40 @@ # Standard Library import os +import json +import time +from datetime import datetime from pathlib import Path +import pstats # Third Party import tensorflow as tf import pytest -# from tests.tensorflow2.test_keras import helper_keras_fit # First Party import smdebug.tensorflow as smd from smdebug.core.collection import CollectionKeys +from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter from smdebug.profiler.profiler_config_parser import ProfilerConfigParser from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents from smdebug.tensorflow import KerasHook as Hook +from smdebug.profiler.profiler_constants import ( + CONVERT_TO_MICROSECS, + DEFAULT_PREFIX, + TRACE_DIRECTORY_FORMAT, + CPROFILE_NAME, + CPROFILE_STATS_FILENAME, + PYINSTRUMENT_HTML_FILENAME, + PYINSTRUMENT_JSON_FILENAME, + PYINSTRUMENT_NAME, +) +from smdebug.profiler.python_profiler import PythonProfiler +from smdebug.profiler.python_profiler import ( + PyinstrumentPythonProfiler, + cProfilePythonProfiler, + cProfileTimer, +) +from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase @pytest.fixture() def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): @@ -22,6 +43,20 @@ def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): return ProfilerConfigParser() +@pytest.fixture() +def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + +@pytest.fixture() +def 
tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): + config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") + monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) + return ProfilerConfigParser() + + @pytest.fixture() def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") @@ -29,9 +64,24 @@ def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): return ProfilerConfigParser() -def create_hook(trial_dir): - hook = smd.KerasHook(trial_dir, save_all=True) - return hook +@pytest.fixture +def test_framework(): + return "test-framework" + + +@pytest.fixture() +def cprofile_python_profiler(out_dir, test_framework): + return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) + + +@pytest.fixture() +def pyinstrument_python_profiler(out_dir, test_framework): + return PyinstrumentPythonProfiler(out_dir, test_framework) + + +@pytest.fixture() +def framework_dir(out_dir, test_framework): + return "{0}/framework/{1}".format(out_dir, test_framework) def create_model(): @@ -46,12 +96,10 @@ def create_model(): ) return model -def helper_native_tf2(): +def helper_native_tf2_profiler_debugger(trial_dir, hook): -def test_gradtape_tf_function(out_dir): def get_grads(images, labels): - # with tf.GradientTape() as tape: return model(images, training=True) @tf.function @@ -63,32 +111,29 @@ def train_step(images, labels): dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) ) - dataset = dataset.shuffle(1000).batch(64) + dataset = dataset.shuffle(1000).batch(128) model = create_model() - hook = create_hook(out_dir) opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: - dataset_labels = labels labels = tf.one_hot(labels, depth=10) hook.start_profiling_start_train_batch() with hook.wrap_tape(tf.GradientTape()) as tape: logits = train_step(data, labels) grads = tape.gradient(logits, model.variables) opt.apply_gradients(zip(grads, model.variables)) - # hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - # hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - # hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) + hook.save_tensor("inputs", data, CollectionKeys.INPUTS) + hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) + hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(out_dir, save_format="tf") - + model.save(trial_dir, save_format="tf") - trial = smd.create_trial(out_dir) + trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ "weights/dense/kernel:0", @@ -109,7 +154,40 @@ def train_step(images, labels): assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] -def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config_parser_by_step, out_dir): +def helper_native_tf2_profiler(trial_dir, hook): + + def get_grads(images, labels): + return model(images, training=True) + + @tf.function + def train_step(images, labels): + return tf.reduce_mean(get_grads(images, labels)) + + mnist = tf.keras.datasets.mnist + (x_train, y_train), _ = mnist.load_data() + dataset = 
tf.data.Dataset.from_tensor_slices( + (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) + ) + dataset = dataset.shuffle(1000).batch(128) + model = create_model() + opt = tf.keras.optimizers.Adam() + hook.wrap_optimizer(opt) + + n_epochs = 1 + for epoch in range(n_epochs): + for data, labels in dataset: + labels = tf.one_hot(labels, depth=10) + hook.start_profiling_start_train_batch() + with tf.GradientTape() as tape: + logits = train_step(data, labels) + grads = tape.gradient(logits, model.variables) + opt.apply_gradients(zip(grads, model.variables)) + hook.start_profiling_end_train_batch() + hook.stop_profiling_end_of_training() + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): """ This test executes a TF2 native training script, enables detailed TF profiling by step, and verifies the number of events. @@ -117,8 +195,7 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config assert tf2_profiler_config_parser_by_step.profiling_enabled hook = Hook(out_dir=out_dir) - helper_native_tf2(trial_dir=out_dir, hook=hook, eager=True, steps=["train", "eval", "predict"]) - hook.close() + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -141,4 +218,236 @@ def test_native_tf2_profiler_by_step(set_up_resource_config, tf2_profiler_config # The number of events is varying by a small number on # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 \ No newline at end of file + assert num_trace_events >= 230 + + +def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by step, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_step.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + t_events = TensorboardProfilerEvents() + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 230 + + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. 
+ """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + +@pytest.mark.skip_if_non_eager +def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): + """ + This test executes a TF2 native training script, enables detailed TF profiling by time, and + verifies the number of events. + """ + assert tf2_profiler_config_parser_by_time.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # get tensorboard timeline files + files = [] + for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( + f"*{TENSORBOARDTIMELINE_SUFFIX}" + ): + files.append(path) + + assert len(files) == 1 + + trace_file = str(files[0]) + t_events = TensorboardProfilerEvents() + + t_events.read_events_from_file(trace_file) + + all_trace_events = t_events.get_all_events() + num_trace_events = len(all_trace_events) + + print(f"Number of events read = {num_trace_events}") + + # The number of events is varying by a small number on + # consecutive runs. Hence, the approximation in the below asserts. + assert num_trace_events >= 700 + + +# @pytest.mark.parametrize("use_pyinstrument", [False, True]) +# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) +def test_native_python_profiling_cprofiler( + out_dir, tf2_python_cprofiler_config_parser_by_step +): + assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled + + config = tf2_python_cprofiler_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = CPROFILE_NAME + allowed_files = [CPROFILE_STATS_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. 
+ assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_native_python_profiling_pyinstrument( + out_dir, tf2_python_pyinstrument_config_parser_by_step +): + assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled + + config = tf2_python_pyinstrument_config_parser_by_step.config + print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) + print('\nname: ', config.python_profiling_config.profiler_name) + print('\nstart_step: ', config.python_profiling_config.start_step) + print('\nnum_steps: ', config.python_profiling_config.num_steps) + + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + + profiler_name = PYINSTRUMENT_NAME + allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] + python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + # Test that directory and corresponding files exist. + assert os.path.isdir(python_stats_dir) + + for node_id in os.listdir(python_stats_dir): + node_dir_path = os.path.join(python_stats_dir, node_id) + stats_dirs = os.listdir(node_dir_path) + assert len(stats_dirs) == (end_step - start_step) * 2 + 1 + + for stats_dir in stats_dirs: + # Validate that the expected files are in the stats dir + stats_dir_path = os.path.join(node_dir_path, stats_dir) + stats_files = os.listdir(stats_dir_path) + assert set(stats_files) == set(allowed_files) + + # Validate the actual stats files + for stats_file in stats_files: + stats_path = os.path.join(stats_dir_path, stats_file) + if stats_file == CPROFILE_STATS_FILENAME: + assert pstats.Stats(stats_path) + elif stats_file == PYINSTRUMENT_JSON_FILENAME: + with open(stats_path, "r") as f: + assert json.load(f) + + +def test_create_timeline_file(simple_profiler_config_parser, out_dir): + """ + This test is meant to test successful creation of the timeline file according to file path specification. + $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ + {$ENV_NODE_ID_4digits0padded}_pythontimeline.json + It reads backs the file contents to make sure it is in valid JSON format. 
+ """ + assert simple_profiler_config_parser.profiling_enabled + + hook = Hook(out_dir=out_dir) + helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) + + files = [] + for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + files.append(path) + + assert len(files) == 1 + + file_ts = files[0].name.split("_")[0] + folder_name = files[0].parent.name + assert folder_name == time.strftime( + TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) + ) + assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( + TRACE_DIRECTORY_FORMAT + ) + + with open(files[0]) as timeline_file: + events_dict = json.load(timeline_file) + + assert events_dict \ No newline at end of file From d7c4a10be9310fba692473d8a333e52a1349647c Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 12:05:19 -0800 Subject: [PATCH 81/97] add python profiler as attr for kerashook --- smdebug/tensorflow/keras.py | 6 ++---- tests/profiler/tensorflow2/test_native_tf2_profiling.py | 6 +++++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 2a9d7b8e0..6c5e3f384 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -61,7 +61,6 @@ if config.python_profiling_config.is_enabled(): python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") python_profiler.start_profiling(StepPhase.START) -# print('prezero-step start profiling object outside: ', python_profiler) class KerasHook(TensorflowBaseHook, tf.keras.callbacks.Callback): @@ -1138,6 +1137,7 @@ def unwrap(func): def close(self): self._cleanup() + if self.python_profiler: self.python_profiler.start_profiling( StepPhase.STEP_END, @@ -1146,6 +1146,7 @@ def close(self): ) self.debugger_native_training = False + def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1368,7 +1369,6 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): self.is_dataloader_profiling = False if self.python_profiler: - # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1377,7 +1377,6 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): if self.profiler_config_parser.should_save_metrics( MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] ): - # print("Start python profiling in start train batch") self.python_profiler.start_profiling( StepPhase.STEP_START, start_mode=mode_keys_to_python_profile_mode(mode), @@ -1404,7 +1403,6 @@ def profiling_end_batch(self, mode=ModeKeys.TRAIN): step_num=str(self.mode_steps[mode]), ) - def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
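With the keras.py change above, the Python profiler instance now lives on the hook (self.python_profiler) rather than only on the module-level python_profiler object, so a native TF2 training script has to hand the profiler to the hook explicitly; the test change below does exactly that via hook.python_profiler = python_profiler. The following is a minimal sketch of that wiring outside of pytest, assuming SMPROFILER_CONFIG_PATH points at a config whose PythonProfilingConfig is enabled and reusing only names that already appear in this series; the output directory, toy model, and synthetic data are illustrative placeholders, not part of the patch, and the batch-boundary entry points used are the ones the tests in this series call.

import tensorflow as tf

from smdebug.profiler.profiler_config_parser import ProfilerConfigParser
from smdebug.profiler.python_profiler import PythonProfiler
from smdebug.tensorflow import KerasHook

# Build the profiler from the same config the hook reads (SMPROFILER_CONFIG_PATH).
parser = ProfilerConfigParser()
python_profiler = None
if parser.profiling_enabled and parser.config.python_profiling_config.is_enabled():
    python_profiler = PythonProfiler.get_python_profiler(parser.config, "tensorflow")

hook = KerasHook(out_dir="/tmp/smdebug_native_tf2")  # hypothetical output directory
hook.python_profiler = python_profiler  # the attribute introduced in this patch

# Illustrative toy model and data; the actual tests train on MNIST.
model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
opt = tf.keras.optimizers.Adam()
hook.wrap_optimizer(opt)

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform((64, 8)), tf.random.uniform((64,), maxval=10, dtype=tf.int64))
).batch(16)

for data, labels in dataset:
    hook.start_profiling_start_train_batch()  # profiling for this step starts here
    with tf.GradientTape() as tape:
        logits = model(data, training=True)
        loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
        )
    grads = tape.gradient(loss, model.variables)
    opt.apply_gradients(zip(grads, model.variables))
    hook.start_profiling_end_train_batch()  # ...and is closed out here
hook.stop_profiling_end_of_training()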
diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 3e1f08b2b..9ba8dd60b 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -173,6 +173,8 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) + + # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -338,6 +340,7 @@ def test_native_python_profiling_cprofiler( print('\nnum_steps: ', config.python_profiling_config.num_steps) python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -347,6 +350,7 @@ def test_native_python_profiling_cprofiler( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. @@ -384,7 +388,7 @@ def test_native_python_profiling_pyinstrument( print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps From 24704ca6f81b035779a04e9f74a2559d125e6a06 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 12 Jan 2021 16:01:59 -0800 Subject: [PATCH 82/97] add tests --- .../tensorflow2/test_native_tf2_profiling.py | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py index 9ba8dd60b..9a7416be3 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiling.py @@ -5,6 +5,7 @@ from datetime import datetime from pathlib import Path import pstats +import atexit # Third Party import tensorflow as tf @@ -84,6 +85,18 @@ def framework_dir(out_dir, test_framework): return "{0}/framework/{1}".format(out_dir, test_framework) +def set_up_profiling(profilerconfig): + profiler_config_parser = profilerconfig + python_profiler = None + if profiler_config_parser.profiling_enabled: + config = profiler_config_parser.config + if config.python_profiling_config.is_enabled(): + python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + python_profiler.start_profiling(StepPhase.START) + atexit.register(python_profiler.stop_profiling, StepPhase.END) + return profiler_config_parser, python_profiler + + def create_model(): model = tf.keras.models.Sequential( [ @@ -131,7 +144,7 @@ def train_step(images, labels): hook.start_profiling_end_train_batch() hook.stop_profiling_end_of_training() - model.save(trial_dir, save_format="tf") + # model.save(trial_dir, save_format="tf") trial = smd.create_trial(trial_dir) assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] @@ -173,8 +186,6 @@ def train_step(images, labels): opt = tf.keras.optimizers.Adam() hook.wrap_optimizer(opt) 
- - # print('\nTraining script: ', hook.python_profiler) n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: @@ -196,7 +207,7 @@ def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parse """ assert tf2_profiler_config_parser_by_step.profiling_enabled - hook = Hook(out_dir=out_dir) + hook = Hook(out_dir=out_dir, save_all=True) helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) t_events = TensorboardProfilerEvents() @@ -333,13 +344,13 @@ def test_native_python_profiling_cprofiler( ): assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - config = tf2_python_cprofiler_config_parser_by_step.config + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) + + config = profiler_config_parser.config print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) print('\nname: ', config.python_profiling_config.name) print('\nstart_step: ', config.python_profiling_config.start_step) print('\nnum_steps: ', config.python_profiling_config.num_steps) - - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") print('\ntest function: ', python_profiler) start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps @@ -382,13 +393,9 @@ def test_native_python_profiling_pyinstrument( ): assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - config = tf2_python_pyinstrument_config_parser_by_step.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.profiler_name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) + profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - # python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") + config = profiler_config_parser.config start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps @@ -398,6 +405,7 @@ def test_native_python_profiling_pyinstrument( python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) hook = Hook(out_dir=out_dir) + hook.python_profiler = python_profiler helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) # Test that directory and corresponding files exist. 
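The Python-profiling assertions above only check that each dump loads (pstats.Stats(...) or json.load(...)); the expected count of (end_step - start_step) * 2 + 1 stats directories appears to reflect that, for each profiled step, stats are written both for the step itself and for the gap preceding it, plus one trailing record after the last profiled step. Below is a small hedged sketch for inspecting the cProfile dumps offline, assuming the same directory layout the tests walk (<base>/framework/tensorflow/<profiler name>/<node_id>/<stats_dir>/<stats file>); the base path and the report settings are illustrative choices, not part of the patch.

import os
import pstats

from smdebug.profiler.profiler_constants import CPROFILE_NAME

# Hypothetical base directory: whatever out_dir/LocalPath the profiled run wrote to.
python_stats_dir = os.path.join("/tmp/test", "framework", "tensorflow", CPROFILE_NAME)

for node_id in sorted(os.listdir(python_stats_dir)):
    node_dir = os.path.join(python_stats_dir, node_id)
    for stats_dir in sorted(os.listdir(node_dir)):
        for stats_file in os.listdir(os.path.join(node_dir, stats_dir)):
            stats_path = os.path.join(node_dir, stats_dir, stats_file)
            print(f"--- {node_id}/{stats_dir}/{stats_file} ---")
            # Report the five most expensive calls by cumulative time.
            pstats.Stats(stats_path).sort_stats("cumulative").print_stats(5)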
From e3fbf91239fa6d480225c8b37ed750fb5dddb838 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 14:16:24 -0800 Subject: [PATCH 83/97] update profiler for native tf training --- smdebug/tensorflow/keras.py | 20 +- ...filer_cprofiler_config_parser_by_step.json | 3 +- ...er_pyinstrument_config_parser_by_step.json | 1 - .../tensorflow2/test_native_tf2_profiling.py | 465 ------------------ 4 files changed, 9 insertions(+), 480 deletions(-) delete mode 100644 tests/profiler/tensorflow2/test_native_tf2_profiling.py diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 6c5e3f384..054341d5a 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -146,16 +146,11 @@ def _is_not_supported(self): self._hook_supported = False elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: try: - if is_tf_version_greater_than_2_4_x(): - # distributed_training_utils.py renamed to distributed_training_utils_v1 in tf 2.4.0 - from tensorflow.python.keras.distribute.distributed_training_utils_v1 import ( - get_distributed_model, - ) - else: - from tensorflow.python.keras.distribute.distributed_training_utils import ( - get_distributed_model, - ) + # Third Party + from tensorflow.python.keras.distribute.distributed_training_utils import ( + get_distributed_model, + ) except ImportError: # for tf1.13 we can't import this, so we can't support mirrored strategy self.logger.info( @@ -1139,6 +1134,7 @@ def close(self): self._cleanup() if self.python_profiler: + # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1146,7 +1142,6 @@ def close(self): ) self.debugger_native_training = False - def _cleanup(self): # Unwrap the tape before closing if self.tape: @@ -1353,7 +1348,7 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - # print("Step Number in start train batch: ", self.mode_steps[mode]) + print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() @@ -1369,6 +1364,7 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): self.is_dataloader_profiling = False if self.python_profiler: + # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1389,7 +1385,6 @@ def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the end of train batch when native tf2 training is used. 
""" - if self._is_not_supported(): return @@ -1411,3 +1406,4 @@ def profiling_end(self): self.close() self._end_dataloader_profiling() self._end_detailed_profiling() + diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index f06218f77..d568b471c 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,5 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} - +} \ No newline at end of file diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json index ad5a555f7..53ac1485e 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json @@ -5,4 +5,3 @@ "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" } } - diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiling.py b/tests/profiler/tensorflow2/test_native_tf2_profiling.py deleted file mode 100644 index 9a7416be3..000000000 --- a/tests/profiler/tensorflow2/test_native_tf2_profiling.py +++ /dev/null @@ -1,465 +0,0 @@ -# Standard Library -import os -import json -import time -from datetime import datetime -from pathlib import Path -import pstats -import atexit - -# Third Party -import tensorflow as tf -import pytest - -# First Party -import smdebug.tensorflow as smd -from smdebug.core.collection import CollectionKeys -from smdebug.core.tfevent.timeline_file_writer import TimelineFileWriter -from smdebug.profiler.profiler_config_parser import ProfilerConfigParser -from smdebug.profiler.profiler_constants import TENSORBOARDTIMELINE_SUFFIX -from smdebug.profiler.tf_profiler_parser import TensorboardProfilerEvents -from smdebug.tensorflow import KerasHook as Hook -from smdebug.profiler.profiler_constants import ( - CONVERT_TO_MICROSECS, - DEFAULT_PREFIX, - TRACE_DIRECTORY_FORMAT, - CPROFILE_NAME, - CPROFILE_STATS_FILENAME, - PYINSTRUMENT_HTML_FILENAME, - PYINSTRUMENT_JSON_FILENAME, - PYINSTRUMENT_NAME, -) -from smdebug.profiler.python_profiler import PythonProfiler -from smdebug.profiler.python_profiler import ( - PyinstrumentPythonProfiler, - cProfilePythonProfiler, - cProfileTimer, -) -from smdebug.profiler.python_profile_utils import PythonProfileModes, StepPhase - -@pytest.fixture() -def tf2_profiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture() -def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return 
ProfilerConfigParser() - - -@pytest.fixture() -def tf2_profiler_config_parser_by_time(config_folder, monkeypatch): - config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json") - monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path) - return ProfilerConfigParser() - - -@pytest.fixture -def test_framework(): - return "test-framework" - - -@pytest.fixture() -def cprofile_python_profiler(out_dir, test_framework): - return cProfilePythonProfiler(out_dir, test_framework, cProfileTimer.TOTAL_TIME) - - -@pytest.fixture() -def pyinstrument_python_profiler(out_dir, test_framework): - return PyinstrumentPythonProfiler(out_dir, test_framework) - - -@pytest.fixture() -def framework_dir(out_dir, test_framework): - return "{0}/framework/{1}".format(out_dir, test_framework) - - -def set_up_profiling(profilerconfig): - profiler_config_parser = profilerconfig - python_profiler = None - if profiler_config_parser.profiling_enabled: - config = profiler_config_parser.config - if config.python_profiling_config.is_enabled(): - python_profiler = PythonProfiler.get_python_profiler(config, "tensorflow") - python_profiler.start_profiling(StepPhase.START) - atexit.register(python_profiler.stop_profiling, StepPhase.END) - return profiler_config_parser, python_profiler - - -def create_model(): - model = tf.keras.models.Sequential( - [ - # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279 - tf.keras.layers.Flatten(input_shape=(28, 28, 1)), - tf.keras.layers.Dense(128, activation="relu"), - tf.keras.layers.Dropout(0.2), - tf.keras.layers.Dense(10, activation="softmax"), - ] - ) - return model - - -def helper_native_tf2_profiler_debugger(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with hook.wrap_tape(tf.GradientTape()) as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.save_tensor("inputs", data, CollectionKeys.INPUTS) - hook.save_tensor("logits", logits, CollectionKeys.OUTPUTS) - hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - # model.save(trial_dir, save_format="tf") - - trial = smd.create_trial(trial_dir) - assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] - assert trial.tensor_names(collection=CollectionKeys.WEIGHTS) == [ - "weights/dense/kernel:0", - "weights/dense_1/kernel:0", - ] - assert trial.tensor_names(collection=CollectionKeys.BIASES) == [ - "weights/dense/bias:0", - "weights/dense_1/bias:0", - ] - assert trial.tensor_names(collection=CollectionKeys.OPTIMIZER_VARIABLES) == [ - "Adam/beta_1:0", - "Adam/beta_2:0", - "Adam/decay:0", - "Adam/iter:0", - "Adam/learning_rate:0", - ] - assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] - assert 
trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] - - -def helper_native_tf2_profiler(trial_dir, hook): - - def get_grads(images, labels): - return model(images, training=True) - - @tf.function - def train_step(images, labels): - return tf.reduce_mean(get_grads(images, labels)) - - mnist = tf.keras.datasets.mnist - (x_train, y_train), _ = mnist.load_data() - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32), tf.cast(y_train, tf.int64)) - ) - dataset = dataset.shuffle(1000).batch(128) - model = create_model() - opt = tf.keras.optimizers.Adam() - hook.wrap_optimizer(opt) - - n_epochs = 1 - for epoch in range(n_epochs): - for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) - hook.start_profiling_start_train_batch() - with tf.GradientTape() as tape: - logits = train_step(data, labels) - grads = tape.gradient(logits, model.variables) - opt.apply_gradients(zip(grads, model.variables)) - hook.start_profiling_end_train_batch() - hook.stop_profiling_end_of_training() - - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_step_profiler_debugger(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir, save_all=True) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - -def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by step, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler_debugger(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. 
- """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler_debugger(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - - -# @pytest.mark.parametrize("use_pyinstrument", [False, True]) -# @pytest.mark.parametrize("steps", [(1, 2), (1, 5)]) -def test_native_python_profiling_cprofiler( - out_dir, tf2_python_cprofiler_config_parser_by_step -): - assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_cprofiler_config_parser_by_step) - - config = profiler_config_parser.config - print('\npath: ', os.environ["SMPROFILER_CONFIG_PATH"]) - print('\nname: ', config.python_profiling_config.name) - print('\nstart_step: ', config.python_profiling_config.start_step) - print('\nnum_steps: ', config.python_profiling_config.num_steps) - print('\ntest function: ', python_profiler) - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = CPROFILE_NAME - allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # Test that directory and corresponding files exist. 
- assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -def test_native_python_profiling_pyinstrument( - out_dir, tf2_python_pyinstrument_config_parser_by_step -): - assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(tf2_python_pyinstrument_config_parser_by_step) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = PYINSTRUMENT_NAME - allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] - python_stats_dir = os.path.join(out_dir, 'framework/', 'tensorflow/', profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - # Test that directory and corresponding files exist. - assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -def test_create_timeline_file(simple_profiler_config_parser, out_dir): - """ - This test is meant to test successful creation of the timeline file according to file path specification. - $ENV_BASE_FOLDER/framework/pevents/$START_TIME_YYMMDDHR/$FILEEVENTSTARTTIMEUTCINEPOCH_ - {$ENV_NODE_ID_4digits0padded}_pythontimeline.json - It reads backs the file contents to make sure it is in valid JSON format. 
- """ - assert simple_profiler_config_parser.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_profiler(trial_dir=out_dir, hook=hook) - - files = [] - for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): - files.append(path) - - assert len(files) == 1 - - file_ts = files[0].name.split("_")[0] - folder_name = files[0].parent.name - assert folder_name == time.strftime( - TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) - ) - assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( - TRACE_DIRECTORY_FORMAT - ) - - with open(files[0]) as timeline_file: - events_dict = json.load(timeline_file) - - assert events_dict \ No newline at end of file From 1a1029c6a0339d285c7657f61192665d398c6c5b Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Wed, 13 Jan 2021 15:25:40 -0700 Subject: [PATCH 84/97] Modify distributed_training_utils.py import for TF 2.4 (#422) --- smdebug/tensorflow/keras.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 054341d5a..45751bdec 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -146,11 +146,15 @@ def _is_not_supported(self): self._hook_supported = False elif self.distribution_strategy == TFDistributionStrategy.MIRRORED: try: - - # Third Party - from tensorflow.python.keras.distribute.distributed_training_utils import ( - get_distributed_model, - ) + if is_tf_version_greater_than_2_4_x(): + # distributed_training_utils.py renamed to distributed_training_utils_v1 in tf 2.4.0 + from tensorflow.python.keras.distribute.distributed_training_utils_v1 import ( + get_distributed_model, + ) + else: + from tensorflow.python.keras.distribute.distributed_training_utils import ( + get_distributed_model, + ) except ImportError: # for tf1.13 we can't import this, so we can't support mirrored strategy self.logger.info( From 7fbf9345622e2e7577911a00d956a687fa582e55 Mon Sep 17 00:00:00 2001 From: Nihal Harish Date: Wed, 13 Jan 2021 16:27:58 -0700 Subject: [PATCH 85/97] Cache TF Versions (#421) --- smdebug/tensorflow/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index 11d700dfc..c26247229 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -428,7 +428,7 @@ def is_tf_version_greater_than_2_4_x(): def is_tf_version_greater_than_2_4_x(): - return version.parse("2.4.0") <= version.parse(tf.__version__) + return version.parse("2.4.0") <= TF_VERSION def is_profiler_supported_for_tf_version(): From 61d0fa3c7a85db22628fe5978b1df2bbea0176fd Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 18 Jan 2021 19:48:15 -0800 Subject: [PATCH 86/97] remove print statement --- smdebug/tensorflow/keras.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 45751bdec..6653ec63d 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1136,7 +1136,7 @@ def unwrap(func): def close(self): self._cleanup() - + # print("\nStep Number in the close function: ", self.step) if self.python_profiler: # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( @@ -1184,6 +1184,8 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() + # print("\nStep number in the push tape: ", self.step) + if 
self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1274,6 +1276,7 @@ def run(*args, **kwargs): return self.last_saved_step = self.step + # print("\nStep number in the pop tape: ", self.step) return run @@ -1352,7 +1355,7 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - print("Step Number in start train batch: ", self.mode_steps[mode]) + # print("Step Number in start train batch: ", self.mode_steps[mode]) self.profiler_config_parser.load_config() @@ -1389,6 +1392,7 @@ def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the end of train batch when native tf2 training is used. """ + # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1406,6 +1410,7 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. """ + # print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() self._end_dataloader_profiling() From 797de1bd25ca2f6ea4b4386b3efb017575804d5d Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 22:02:39 -0800 Subject: [PATCH 87/97] clean up the code --- smdebug/tensorflow/keras.py | 38 ++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 6653ec63d..51154dc48 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -1184,8 +1184,6 @@ def run(*args, **kwargs): self.prepared_collections = True self._increment_step() - # print("\nStep number in the push tape: ", self.step) - if self._get_collections_to_save_for_step(): # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) @@ -1276,7 +1274,6 @@ def run(*args, **kwargs): return self.last_saved_step = self.step - # print("\nStep number in the pop tape: ", self.step) return run @@ -1348,6 +1345,8 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): self.set_mode(mode) + # When only profiler is enabled in the native tf2 training, + # increasing the step number in the TRAIN and GLOBAL mode. 
if not self.debugger_native_training: self.step += 1 self.mode_steps[self.mode] += 1 @@ -1355,8 +1354,6 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - # print("Step Number in start train batch: ", self.mode_steps[mode]) - self.profiler_config_parser.load_config() if self.profiler_config_parser.should_save_metrics( @@ -1371,7 +1368,6 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): self.is_dataloader_profiling = False if self.python_profiler: - # print("Stop python profiling in start train batch") self.python_profiler.stop_profiling( StepPhase.STEP_START, end_mode=mode_keys_to_python_profile_mode(mode), @@ -1386,13 +1382,39 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): start_step=self.mode_steps[mode], ) + if is_profiler_supported_for_tf_version(): + if self.profiler_config_parser.should_save_metrics( + MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] + ): + if not self.is_detailed_profiling: + self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( + self.profiler_config_parser.config.local_path, + "tensorflow", + self.mode_steps[mode], + ) + self.logger.info(f"Enabling TF profiler on step: = {self.mode_steps[mode]}") + if not self.warm_up_completed: + # warming up profiler before it will be profiling. + self.tf_profiler.warmup() + self.warm_up_completed = True + self.tf_profiler.start(self._log_dir) + self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS + self.is_detailed_profiling = True + elif self.is_detailed_profiling: + self.logger.info(f"Disabling TF profiler on step: ={self.mode_steps[mode]}") + stop_tf_profiler( + tf_profiler=self.tf_profiler, + log_dir=self._log_dir, + start_time_us=self.tf_profiler_start_time_in_micros, + ) + self.is_detailed_profiling = False +>>>>>>> clean up the code def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the end of train batch when native tf2 training is used. """ - # print("Step Number in end train batch: ", self.mode_steps[mode]) if self._is_not_supported(): return @@ -1410,9 +1432,7 @@ def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. 
""" - # print("Step Number at the end of training: ", self.step) # Unwrap the tape before closing and close the python profiling self.close() self._end_dataloader_profiling() self._end_detailed_profiling() - From 2462ec8074d0a85010ffe5aa06d36813a3373d91 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 23:22:21 -0800 Subject: [PATCH 88/97] clean up code --- smdebug/tensorflow/keras.py | 6 +----- smdebug/tensorflow/utils.py | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 51154dc48..2f0501c3c 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -105,7 +105,6 @@ def __init__( # Profiling vars self.tf_profiler = None if is_profiler_supported_for_tf_version(): - # Third Party from tensorflow.python.profiler import profiler_v2 as tf_profiler self.tf_profiler = tf_profiler @@ -337,7 +336,6 @@ def _create_tensors_for_matching_collections( def _get_distributed_model(self, mode): # not available in tf 1.13, code shouldn't reach here for 1.13 # because of _is_not_supported - # Third Party from tensorflow.python.keras.distribute.distributed_training_utils import ( get_distributed_model, ) @@ -1136,9 +1134,8 @@ def unwrap(func): def close(self): self._cleanup() - # print("\nStep Number in the close function: ", self.step) + if self.python_profiler: - # print("python profiling for end of last train step to end of training") self.python_profiler.start_profiling( StepPhase.STEP_END, start_mode=mode_keys_to_python_profile_mode(self.mode), @@ -1186,7 +1183,6 @@ def run(*args, **kwargs): self._increment_step() if self._get_collections_to_save_for_step(): - # print('\n Collections saved for this step: ', self._get_collections_to_save_for_step()) self._initialize_writers() if self.last_saved_step is not None and self._exported_collections is False: diff --git a/smdebug/tensorflow/utils.py b/smdebug/tensorflow/utils.py index c26247229..af9a0e901 100644 --- a/smdebug/tensorflow/utils.py +++ b/smdebug/tensorflow/utils.py @@ -427,10 +427,6 @@ def is_tf_version_greater_than_2_4_x(): return version.parse("2.4.0") <= TF_VERSION -def is_tf_version_greater_than_2_4_x(): - return version.parse("2.4.0") <= TF_VERSION - - def is_profiler_supported_for_tf_version(): # Profiler Support Added For TF Versions 2.2.0 And Greater return version.parse("2.2.0") <= TF_VERSION From ed694bd0b66959db0019ee37e1ab4befe512b5b9 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 19 Jan 2021 23:56:45 -0800 Subject: [PATCH 89/97] update format --- ...est_tf2_python_profiler_cprofiler_config_parser_by_step.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json index d568b471c..e51c386c2 100644 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json @@ -4,4 +4,4 @@ "LocalPath": "/tmp/test", "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" } -} \ No newline at end of file +} From c8d47c996b3fb1fdce84e3bbe1a605ae8ebb2828 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Fri, 22 Jan 2021 00:13:07 -0800 Subject: [PATCH 90/97] update _on_any_mode_end() func for the posthookclose python profiling --- smdebug/tensorflow/keras.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 2f0501c3c..835a0b78f 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -845,6 +845,7 @@ def _on_any_mode_end(self, mode): self._end_phase_python_profiling(mode=mode) self._end_detailed_profiling() + self._end_dataloader_profiling() def on_train_end(self, logs=None): self._on_any_mode_end(ModeKeys.TRAIN) From e0df694135bee800a7610b8f12ad78f6caecffc8 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Fri, 22 Jan 2021 11:11:03 -0800 Subject: [PATCH 91/97] rename the debugger native training flag and update the path join in the unit test --- smdebug/tensorflow/keras.py | 17 ++--------------- .../tensorflow2/test_native_tf2_profiler.py | 7 ++++++- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 835a0b78f..cbf4a76d5 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -122,10 +122,6 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag is used to handle step number increment in the tensorflow native training when profiler is on - # it indicates to profiling for tensorflow2 native training - self.profiling_native_training = False - if self.python_profiler: atexit.register(self.python_profiler.stop_profiling, StepPhase.END) @@ -1142,7 +1138,7 @@ def close(self): start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) - self.debugger_native_training = False + self.is_debugger_enabled_for_native_training = False def _cleanup(self): # Unwrap the tape before closing @@ -1299,7 +1295,7 @@ def wrap_tape(self, tape): """ from tensorflow.python.eager.backprop import GradientTape - self.debugger_native_training = True + self.is_debugger_enabled_for_native_training = True self.set_mode(ModeKeys.TRAIN) if isinstance(tape, GradientTape): @@ -1342,15 +1338,6 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): self.set_mode(mode) - # When only profiler is enabled in the native tf2 training, - # increasing the step number in the TRAIN and GLOBAL mode. 
- if not self.debugger_native_training: - self.step += 1 - self.mode_steps[self.mode] += 1 - # Increment Global step number irrespective of what mode it is - if self.mode != ModeKeys.GLOBAL: - self.mode_steps[ModeKeys.GLOBAL] = self.step - self.profiler_config_parser.load_config() if self.profiler_config_parser.should_save_metrics( diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiler.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py index 986be6a46..09ddb1818 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiler.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -322,11 +322,16 @@ def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step # get tensorboard timeline files files = [] +<<<<<<< HEAD <<<<<<< HEAD for path in Path(profilerconfig.config.local_path + "/framework").rglob( ======= for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( >>>>>>> update profiler for native tf training +======= + + for path in Path(os.path.join(profilerconfig.config.local_path + "/framework")).rglob( +>>>>>>> rename the debugger native training flag and update the path join in the unit test f"*{TENSORBOARDTIMELINE_SUFFIX}" ): files.append(path) @@ -538,7 +543,7 @@ def test_create_timeline_file(simple_profiler_config_parser, out_dir): >>>>>>> update profiler for native tf training files = [] - for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): + for path in Path(os.path.join(out_dir + "/" + DEFAULT_PREFIX)).rglob("*.json"): files.append(path) assert len(files) == 1 From defa97f91807542690326a16b2c349dae491effa Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 25 Jan 2021 13:48:01 -0800 Subject: [PATCH 92/97] update format --- smdebug/tensorflow/keras.py | 83 ++++++------------- .../tensorflow2/test_native_tf2_profiler.py | 7 ++ 2 files changed, 34 insertions(+), 56 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index cbf4a76d5..125f0a362 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -122,6 +122,10 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False + # this flag indicates to debugging for tensorflow2 native training + # self.is_debugger_enabled_for_native_training = False + self.is_profiler_enabled_for_native_training = False + if self.python_profiler: atexit.register(self.python_profiler.stop_profiling, StepPhase.END) @@ -711,6 +715,16 @@ def _remove_fetches_and_callbacks(self, mode): x.fetch_callbacks.pop(tf_obj) self._fetches_added.clear() + def _decrement_step(self): + # Called when both profiler and debugger are enabled in the native training loop + # to adjust the step number + self.step -= 1 + self.mode_steps[self.mode] -= 1 + + # Increment Global step number irrespective of what mode it is + if self.mode != ModeKeys.GLOBAL: + self.mode_steps[ModeKeys.GLOBAL] = self.step + def _start_phase_python_profiling(self, mode): if self.python_profiler: self.python_profiler.stop_profiling( @@ -1138,7 +1152,7 @@ def close(self): start_mode=mode_keys_to_python_profile_mode(self.mode), start_step=self.mode_steps[self.mode], ) - self.is_debugger_enabled_for_native_training = False + self.is_profiler_enabled_for_native_training = False def _cleanup(self): # Unwrap the tape before closing @@ -1295,9 +1309,14 @@ def wrap_tape(self, tape): """ from tensorflow.python.eager.backprop 
import GradientTape - self.is_debugger_enabled_for_native_training = True self.set_mode(ModeKeys.TRAIN) + # When both profiler and debugger are enabled in the native training, step number is firstly increased by 1 in + # the profiling_start_batch() function, and should be decreased by 1 here in order to keep the step number + # correct when calling _increment_step() function inside _wrap_push_tape() function. + if self.is_profiler_enabled_for_native_training: + self._decrement_step() + if isinstance(tape, GradientTape): # unwrap tape before wrapping new tape to avoid recursive wrap tapes if self.tape: @@ -1338,61 +1357,13 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): self.set_mode(mode) - self.profiler_config_parser.load_config() + self.is_profiler_enabled_for_native_training = True - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] - ) and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_START_FLAG_FILENAME - ): - self.is_dataloader_profiling = True - elif self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( - TF_DATALOADER_END_FLAG_FILENAME - ): - self.is_dataloader_profiling = False - - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(mode), - end_step=self.mode_steps[mode], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] - ): - self.python_profiler.start_profiling( - StepPhase.STEP_START, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) - - if is_profiler_supported_for_tf_version(): - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] - ): - if not self.is_detailed_profiling: - self._log_dir = TraceFileLocation.get_detailed_profiling_log_dir( - self.profiler_config_parser.config.local_path, - "tensorflow", - self.mode_steps[mode], - ) - self.logger.info(f"Enabling TF profiler on step: = {self.mode_steps[mode]}") - if not self.warm_up_completed: - # warming up profiler before it will be profiling. - self.tf_profiler.warmup() - self.warm_up_completed = True - self.tf_profiler.start(self._log_dir) - self.tf_profiler_start_time_in_micros = time.time() * CONVERT_TO_MICROSECS - self.is_detailed_profiling = True - elif self.is_detailed_profiling: - self.logger.info(f"Disabling TF profiler on step: ={self.mode_steps[mode]}") - stop_tf_profiler( - tf_profiler=self.tf_profiler, - log_dir=self._log_dir, - start_time_us=self.tf_profiler_start_time_in_micros, - ) - self.is_detailed_profiling = False ->>>>>>> clean up the code + # When only profiler is enabled in the native tf2 training, + # increasing the step number in the TRAIN and GLOBAL mode + # and not writing the state. 
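# (When the debugger also wraps the tape for this step, wrap_tape() decrements the step again,
# so the _increment_step() call made inside the wrapped push tape leaves the count consistent.)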
+ if self.is_profiler_enabled_for_native_training: + self._increment_step(write_state=False) def profiling_end_batch(self, mode=ModeKeys.TRAIN): diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiler.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py index 09ddb1818..474a8556a 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiler.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -250,10 +250,14 @@ def train_step(images, labels): for epoch in range(n_epochs): for data, labels in dataset: labels = tf.one_hot(labels, depth=10) + hook.profiling_start_batch() if debugger: with hook.wrap_tape(tf.GradientTape()) as tape: +<<<<<<< HEAD <<<<<<< HEAD hook.profiling_start_batch() +======= +>>>>>>> update format logits = train_step(data, labels) if python_profiler and start_step <= step < end_step: assert python_profiler._start_step == step @@ -271,8 +275,11 @@ def train_step(images, labels): hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS) else: with tf.GradientTape() as tape: +<<<<<<< HEAD <<<<<<< HEAD hook.profiling_start_batch() +======= +>>>>>>> update format logits = train_step(data, labels) if python_profiler and start_step <= step < end_step: assert python_profiler._start_step == step From ff382d3d39561b86fb6936baadec88621773e6f5 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 25 Jan 2021 14:27:55 -0800 Subject: [PATCH 93/97] update the comments --- smdebug/tensorflow/keras.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 125f0a362..52b9ed2c3 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -122,11 +122,9 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag indicates to debugging for tensorflow2 native training - # self.is_debugger_enabled_for_native_training = False + # this flag indicates to profiler for tensorflow2 native training self.is_profiler_enabled_for_native_training = False - if self.python_profiler: atexit.register(self.python_profiler.stop_profiling, StepPhase.END) From d86e0404b8e5c34e8c0cce75b3b8c1fb958ff846 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 25 Jan 2021 15:00:00 -0800 Subject: [PATCH 94/97] update comments --- smdebug/tensorflow/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 52b9ed2c3..f9e395fbc 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -122,7 +122,7 @@ def __init__( # this flag indicated to the train_batch_begin callback # the the step was already incremented in the on_train_begin callback self.step_incremented_in_on_train_begin = False - # this flag indicates to profiler for tensorflow2 native training + # this flag indicates to profiling for tensorflow2 native training self.is_profiler_enabled_for_native_training = False if self.python_profiler: From d842f8a84f8564f9f4472e38572942916128ebbf Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Mon, 25 Jan 2021 21:13:36 -0800 Subject: [PATCH 95/97] add docstring, update helper function names and improve the unit tests --- smdebug/tensorflow/keras.py | 103 ++++++++-------- .../tensorflow2/test_native_tf2_profiler.py | 112 +++++++++++++----- 2 files changed, 139 insertions(+), 76 deletions(-) diff --git a/smdebug/tensorflow/keras.py 
b/smdebug/tensorflow/keras.py index f9e395fbc..1a1cffe32 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -714,16 +714,25 @@ def _remove_fetches_and_callbacks(self, mode): self._fetches_added.clear() def _decrement_step(self): - # Called when both profiler and debugger are enabled in the native training loop - # to adjust the step number + """ + Called when both profiler and debugger are enabled in the native training loop + to adjust the step number + """ + self.step -= 1 self.mode_steps[self.mode] -= 1 - # Increment Global step number irrespective of what mode it is + # Decrease Global step number irrespective of what mode it is if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - def _start_phase_python_profiling(self, mode): + def _handle_start_python_profiling(self, mode): + """ + This function is called to handle python profiling at the start of a step. + :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: + """ + if self.python_profiler: self.python_profiler.stop_profiling( StepPhase.STEP_START, @@ -739,7 +748,12 @@ def _start_phase_python_profiling(self, mode): start_step=self.mode_steps[mode], ) - def _end_phase_python_profiling(self, mode): + def _handle_end_python_profiling(self, mode): + """ + This function is called to handle python profiling at the end of a step. + :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: + """ if self.python_profiler: self.python_profiler.stop_profiling( StepPhase.STEP_END, @@ -755,7 +769,12 @@ def _end_phase_python_profiling(self, mode): start_step=self.mode_steps[mode], ) - def _begin_detailed_profiling(self, mode=ModeKeys.TRAIN): + def _handle_start_detailed_profiling(self, mode=ModeKeys.TRAIN): + """ + This function is called to handle detailed profiling at the start of a step. + :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: + """ if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( MetricsCategory.DETAILED_PROFILING, self.mode_steps[mode] @@ -783,7 +802,10 @@ def _begin_detailed_profiling(self, mode=ModeKeys.TRAIN): ) self.is_detailed_profiling = False - def _end_detailed_profiling(self): + def _handle_end_detailed_profiling(self): + """ + This function is called to handle detailed profiling at the end of a step. + """ if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: self.logger.info("Disabling profiler, reached end of training.") stop_tf_profiler( @@ -793,7 +815,12 @@ def _end_detailed_profiling(self): ) self.is_detailed_profiling = False - def _begin_dataloader_profiling(self, mode): + def _handle_start_dataloader_profiling(self, mode): + """ + This function is called to handle dataloader profiling at the start of a step. + :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: + """ if self.profiler_config_parser.should_save_metrics( MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] ) and self.profiler_config_parser.write_tf_dataloader_flag( @@ -805,7 +832,10 @@ def _begin_dataloader_profiling(self, mode): ): self.is_dataloader_profiling = False - def _end_dataloader_profiling(self): + def _handle_end_dataloader_profiling(self): + """ + This function is called to handle dataloader profiling at the end of a step. 
+ """ if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( TF_DATALOADER_END_FLAG_FILENAME ): @@ -857,7 +887,7 @@ def _on_any_mode_end(self, mode): def on_train_end(self, logs=None): self._on_any_mode_end(ModeKeys.TRAIN) - self._end_detailed_profiling() + self._handle_end_detailed_profiling() # throws error in keras if this fn is absent def on_test_end(self, logs=None): @@ -903,24 +933,6 @@ def _on_any_batch_begin(self, batch, mode, logs=None): self.profiler_config_parser.load_config() - self._begin_dataloader_profiling(mode=mode) - - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_START, - end_mode=mode_keys_to_python_profile_mode(mode), - end_step=self.mode_steps[mode], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] - ): - self.python_profiler.start_profiling( - StepPhase.STEP_START, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) ->>>>>>> add python profiler as attr for kerashook - if self.prepared_collections is False: # sets prepared_collections to True here self._prepare_collections() @@ -953,7 +965,7 @@ def _on_any_batch_begin(self, batch, mode, logs=None): def on_train_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.TRAIN, logs=logs) - self._begin_detailed_profiling() + self._handle_start_detailed_profiling() def on_test_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.EVAL, logs=logs) @@ -1047,21 +1059,6 @@ def _on_any_batch_end(self, batch, mode, logs=None): self._exported_model[self.mode] = True - if self.python_profiler: - self.python_profiler.stop_profiling( - StepPhase.STEP_END, - end_mode=mode_keys_to_python_profile_mode(mode), - end_step=self.mode_steps[mode], - ) - if self.profiler_config_parser.should_save_metrics( - MetricsCategory.PYTHON_PROFILING, self.mode_steps[mode] - ): - self.python_profiler.start_profiling( - StepPhase.STEP_END, - start_mode=mode_keys_to_python_profile_mode(mode), - start_step=self.mode_steps[mode], - ) ->>>>>>> add python profiler as attr for kerashook def on_train_batch_end(self, batch, logs=None): self._on_any_batch_end(batch, ModeKeys.TRAIN, logs=logs) @@ -1346,6 +1343,8 @@ def record_tensor_value(self, tensor_name, tensor_value): def profiling_start_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the start of train batch when native tf2 training is used. + :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: """ self.start = time.time() @@ -1354,19 +1353,24 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): return self.set_mode(mode) - self.is_profiler_enabled_for_native_training = True # When only profiler is enabled in the native tf2 training, # increasing the step number in the TRAIN and GLOBAL mode # and not writing the state. - if self.is_profiler_enabled_for_native_training: - self._increment_step(write_state=False) + + self._increment_step(write_state=False) + self.profiler_config_parser.load_config() + self._handle_start_dataloader_profiling(mode=mode) + self._handle_start_python_profiling(mode=mode) + self._handle_start_detailed_profiling(mode=mode) def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the end of train batch when native tf2 training is used. 
+ :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT + :return: """ if self._is_not_supported(): return @@ -1381,11 +1385,12 @@ def profiling_end_batch(self, mode=ModeKeys.TRAIN): step_num=str(self.mode_steps[mode]), ) + def profiling_end(self): """ Stop profiler at the end of training when native tf2 training is used. """ # Unwrap the tape before closing and close the python profiling self.close() - self._end_dataloader_profiling() - self._end_detailed_profiling() + self._handle_end_dataloader_profiling() + self._handle_end_detailed_profiling() diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiler.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py index 474a8556a..a62951c6e 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiler.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -146,6 +146,7 @@ def generate_profiler_config_parser_all_params( return profiler_config_parser +<<<<<<< HEAD def set_up_profiling(profiler_config_parser): profiler_config_parser = profiler_config_parser ======= @@ -193,6 +194,10 @@ def tf2_profiler_config_parser_by_step_all_params(config_folder, monkeypatch): def set_up_profiling(profilerconfig): profiler_config_parser = profilerconfig >>>>>>> update profiler for native tf training +======= +def set_up_profiling(profiler_config): + profiler_config_parser = profiler_config +>>>>>>> add docstring, update helper function names and improve the unit tests python_profiler = None if profiler_config_parser.profiling_enabled: config = profiler_config_parser.config @@ -249,8 +254,8 @@ def train_step(images, labels): n_epochs = 1 for epoch in range(n_epochs): for data, labels in dataset: - labels = tf.one_hot(labels, depth=10) hook.profiling_start_batch() + labels = tf.one_hot(labels, depth=10) if debugger: with hook.wrap_tape(tf.GradientTape()) as tape: <<<<<<< HEAD @@ -297,7 +302,26 @@ def train_step(images, labels): assert python_profiler._start_phase == StepPhase.STEP_END -def verify_num_trace_events(profilerconfig): +def initiate_python_profiling(profiler_config): + assert profiler_config.profiling_enabled + profiler_config_parser, python_profiler = set_up_profiling(profiler_config) + config = profiler_config_parser.config + start_step = config.python_profiling_config.start_step + num_steps = config.python_profiling_config.num_steps + end_step = start_step + num_steps + return python_profiler, start_step, end_step + + +def train_loop(out_dir, debugger=False, python_profiler=None, start_step=None, end_step=None): + hook = Hook(out_dir=out_dir, save_all=True) + if python_profiler: + hook.python_profiler = python_profiler + helper_native_tf2_gradtape( + hook=hook, debugger=debugger, start_step=start_step, end_step=end_step + ) + + +def verify_num_trace_events(profiler_config): """ This verifies the number of events when detailed profiling is enabled. 
""" @@ -337,8 +361,12 @@ def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step >>>>>>> update profiler for native tf training ======= +<<<<<<< HEAD for path in Path(os.path.join(profilerconfig.config.local_path + "/framework")).rglob( >>>>>>> rename the debugger native training flag and update the path join in the unit test +======= + for path in Path(os.path.join(profiler_config.config.local_path + "/framework")).rglob( +>>>>>>> add docstring, update helper function names and improve the unit tests f"*{TENSORBOARDTIMELINE_SUFFIX}" ): files.append(path) @@ -358,12 +386,15 @@ def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step assert num_trace_events >= 230 +<<<<<<< HEAD <<<<<<< HEAD def train_loop(out_dir, debugger=False): hook = Hook(out_dir=out_dir, save_all=True) helper_native_tf2_gradtape(hook=hook, debugger=debugger) +======= +>>>>>>> add docstring, update helper function names and improve the unit tests def verify_tensor_names(out_dir): """ This verifies the tensor names when debugger is enabled. @@ -570,12 +601,17 @@ def test_create_timeline_file(simple_profiler_config_parser, out_dir): assert events_dict +<<<<<<< HEAD <<<<<<< HEAD def verify_python_profiling(profiler_name, out_dir, profilerconfig, debugger=False): +======= +def verify_python_profiling(profiler_name, out_dir, num_steps): +>>>>>>> add docstring, update helper function names and improve the unit tests """ This executes a TF2 native training script with profiler or both profiler and debugger, enables python profiling by step, and verifies the python profiling's steps and expected output files. """ +<<<<<<< HEAD assert profilerconfig.profiling_enabled profiler_config_parser, python_profiler = set_up_profiling(profilerconfig) @@ -599,6 +635,8 @@ def test_native_tf2_profiler_debugger_all_params( start_step = config.python_profiling_config.start_step num_steps = config.python_profiling_config.num_steps end_step = start_step + num_steps +======= +>>>>>>> add docstring, update helper function names and improve the unit tests <<<<<<< HEAD if profiler_name == CPROFILE_NAME: @@ -608,15 +646,6 @@ def test_native_tf2_profiler_debugger_all_params( allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) - hook = Hook(out_dir=out_dir, save_all=True) - hook.python_profiler = python_profiler - helper_native_tf2_gradtape( - hook=hook, - python_profiler=python_profiler, - start_step=start_step, - end_step=end_step, - debugger=debugger, - ) # Test that directory and corresponding files exist. 
======= @@ -685,19 +714,33 @@ def test_native_tf2_profiling_debugger( profiler_config_parser = generate_profiler_config_parser( "PythonProfiling", profiler_config_path, (5, 2, CPROFILE_NAME, None) ) - verify_python_profiling(CPROFILE_NAME, out_dir, profiler_config_parser) - verify_timeline_file(out_dir) if enable_python_profiling == PYINSTRUMENT_NAME: profiler_config_parser = generate_profiler_config_parser( "PythonProfiling", profiler_config_path, (10, 3, PYINSTRUMENT_NAME, None) ) - verify_python_profiling(PYINSTRUMENT_NAME, out_dir, profiler_config_parser) - verify_timeline_file(out_dir) + python_profiler, start_step, end_step = initiate_python_profiling( + profiler_config_parser + ) + train_loop( + out_dir, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) + verify_python_profiling( + enable_python_profiling, out_dir, num_steps=end_step - start_step + ) + verify_timeline_file(out_dir) elif enable_detailed_profiling and enable_python_profiling: profiler_config_parser = generate_profiler_config_parser_all_params( profiler_config_path, (4, 2, enable_python_profiling, None), (8, 1, None, None) ) - verify_python_profiling(enable_python_profiling, out_dir, profiler_config_parser) + python_profiler, start_step, end_step = initiate_python_profiling( + profiler_config_parser + ) + train_loop( + out_dir, python_profiler=python_profiler, start_step=start_step, end_step=end_step + ) + verify_python_profiling( + enable_python_profiling, out_dir, num_steps=end_step - start_step + ) verify_num_trace_events(profiler_config_parser) verify_timeline_file(out_dir) else: @@ -716,26 +759,41 @@ def test_native_tf2_profiling_debugger( profiler_config_parser = generate_profiler_config_parser( "PythonProfiling", profiler_config_path, (5, 2, CPROFILE_NAME, None) ) - verify_python_profiling( - CPROFILE_NAME, out_dir, profiler_config_parser, debugger=True - ) - verify_timeline_file(out_dir) - verify_tensor_names(out_dir) if enable_python_profiling == PYINSTRUMENT_NAME: profiler_config_parser = generate_profiler_config_parser( "PythonProfiling", profiler_config_path, (10, 3, PYINSTRUMENT_NAME, None) ) - verify_python_profiling( - PYINSTRUMENT_NAME, out_dir, profiler_config_parser, debugger=True - ) - verify_timeline_file(out_dir) - verify_tensor_names(out_dir) + python_profiler, start_step, end_step = initiate_python_profiling( + profiler_config_parser + ) + train_loop( + out_dir, + debugger=True, + python_profiler=python_profiler, + start_step=start_step, + end_step=end_step, + ) + verify_python_profiling( + enable_python_profiling, out_dir, num_steps=end_step - start_step + ) + verify_timeline_file(out_dir) + verify_tensor_names(out_dir) elif enable_detailed_profiling and enable_python_profiling: profiler_config_parser = generate_profiler_config_parser_all_params( profiler_config_path, (4, 2, enable_python_profiling, None), (8, 1, None, None) ) + python_profiler, start_step, end_step = initiate_python_profiling( + profiler_config_parser + ) + train_loop( + out_dir, + debugger=True, + python_profiler=python_profiler, + start_step=start_step, + end_step=end_step, + ) verify_python_profiling( - enable_python_profiling, out_dir, profiler_config_parser, debugger=True + enable_python_profiling, out_dir, num_steps=end_step - start_step ) verify_num_trace_events(profiler_config_parser) verify_timeline_file(out_dir) From 499d1a3b8ba66938841f163eb5fac3433a6d4850 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Tue, 26 Jan 2021 16:54:06 -0800 Subject: [PATCH 96/97] update docstring and function 
name --- smdebug/tensorflow/keras.py | 50 ++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 1a1cffe32..5475ebb34 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -722,15 +722,14 @@ def _decrement_step(self): self.step -= 1 self.mode_steps[self.mode] -= 1 - # Decrease Global step number irrespective of what mode it is + # Decrement Global step number irrespective of what mode it is if self.mode != ModeKeys.GLOBAL: self.mode_steps[ModeKeys.GLOBAL] = self.step - def _handle_start_python_profiling(self, mode): + def _handle_start_step_python_profiling(self, mode): """ This function is called to handle python profiling at the start of a step. :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ if self.python_profiler: @@ -748,11 +747,10 @@ def _handle_start_python_profiling(self, mode): start_step=self.mode_steps[mode], ) - def _handle_end_python_profiling(self, mode): + def _handle_end_step_python_profiling(self, mode): """ This function is called to handle python profiling at the end of a step. :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ if self.python_profiler: self.python_profiler.stop_profiling( @@ -769,11 +767,10 @@ def _handle_end_python_profiling(self, mode): start_step=self.mode_steps[mode], ) - def _handle_start_detailed_profiling(self, mode=ModeKeys.TRAIN): + def _handle_detailed_profiling(self, mode=ModeKeys.TRAIN): """ - This function is called to handle detailed profiling at the start of a step. + This function is called to handle detailed profiling at the start of a mode. :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ if is_profiler_supported_for_tf_version(): if self.profiler_config_parser.should_save_metrics( @@ -802,9 +799,9 @@ def _handle_start_detailed_profiling(self, mode=ModeKeys.TRAIN): ) self.is_detailed_profiling = False - def _handle_end_detailed_profiling(self): + def _stop_detailed_profiling(self): """ - This function is called to handle detailed profiling at the end of a step. + This function is called to stop detailed profiling at the end of a mode. """ if is_profiler_supported_for_tf_version() and self.is_detailed_profiling: self.logger.info("Disabling profiler, reached end of training.") @@ -815,11 +812,10 @@ def _handle_end_detailed_profiling(self): ) self.is_detailed_profiling = False - def _handle_start_dataloader_profiling(self, mode): + def _handle_dataloader_profiling(self, mode): """ - This function is called to handle dataloader profiling at the start of a step. + This function is called to handle dataloader profiling at the start of a mode. :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ if self.profiler_config_parser.should_save_metrics( MetricsCategory.DATALOADER_PROFILING, self.mode_steps[mode] @@ -832,9 +828,9 @@ def _handle_start_dataloader_profiling(self, mode): ): self.is_dataloader_profiling = False - def _handle_end_dataloader_profiling(self): + def _stop_dataloader_profiling(self): """ - This function is called to handle dataloader profiling at the end of a step. + This function is called to stop dataloader profiling at the end of a mode. 
""" if self.is_dataloader_profiling and self.profiler_config_parser.write_tf_dataloader_flag( TF_DATALOADER_END_FLAG_FILENAME @@ -880,14 +876,15 @@ def on_test_begin(self, logs=None): self._on_any_mode_begin(ModeKeys.EVAL) def _on_any_mode_end(self, mode): - self._end_phase_python_profiling(mode=mode) self._end_detailed_profiling() self._end_dataloader_profiling() + self._stop_dataloader_profiling() + def on_train_end(self, logs=None): self._on_any_mode_end(ModeKeys.TRAIN) - self._handle_end_detailed_profiling() + self._stop_detailed_profiling() # throws error in keras if this fn is absent def on_test_end(self, logs=None): @@ -965,7 +962,7 @@ def _on_any_batch_begin(self, batch, mode, logs=None): def on_train_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.TRAIN, logs=logs) - self._handle_start_detailed_profiling() + self._handle_detailed_profiling() def on_test_batch_begin(self, batch, logs=None): self._on_any_batch_begin(batch, ModeKeys.EVAL, logs=logs) @@ -1059,7 +1056,6 @@ def _on_any_batch_end(self, batch, mode, logs=None): self._exported_model[self.mode] = True - def on_train_batch_end(self, batch, logs=None): self._on_any_batch_end(batch, ModeKeys.TRAIN, logs=logs) @@ -1306,9 +1302,9 @@ def wrap_tape(self, tape): self.set_mode(ModeKeys.TRAIN) - # When both profiler and debugger are enabled in the native training, step number is firstly increased by 1 in + # When both profiler and debugger are enabled in the native training, step number is increased by 1 in # the profiling_start_batch() function, and should be decreased by 1 here in order to keep the step number - # correct when calling _increment_step() function inside _wrap_push_tape() function. + # consistent since _increment_step() will be called in wrap_push_tape(). if self.is_profiler_enabled_for_native_training: self._decrement_step() @@ -1344,7 +1340,6 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the start of train batch when native tf2 training is used. :param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ self.start = time.time() @@ -1361,16 +1356,15 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): self._increment_step(write_state=False) self.profiler_config_parser.load_config() - self._handle_start_dataloader_profiling(mode=mode) - self._handle_start_python_profiling(mode=mode) - self._handle_start_detailed_profiling(mode=mode) + self._handle_dataloader_profiling(mode=mode) + self._handle_start_step_python_profiling(mode=mode) + self._handle_detailed_profiling(mode=mode) def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the end of train batch when native tf2 training is used. 
:param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT - :return: """ if self._is_not_supported(): return @@ -1392,5 +1386,5 @@ def profiling_end(self): """ # Unwrap the tape before closing and close the python profiling self.close() - self._handle_end_dataloader_profiling() - self._handle_end_detailed_profiling() + self._stop_dataloader_profiling() + self._stop_detailed_profiling() From e4aa05e4a9a8d9dc62f1588b00ef2711ff554863 Mon Sep 17 00:00:00 2001 From: sophiayue1116 Date: Thu, 28 Jan 2021 20:44:09 -0800 Subject: [PATCH 97/97] update unit tests --- smdebug/tensorflow/keras.py | 26 +- ...iler_all_params_config_parser_by_step.json | 8 - ...filer_cprofiler_config_parser_by_step.json | 7 - ...er_pyinstrument_config_parser_by_step.json | 7 - .../tensorflow2/test_native_tf2_profiler.py | 488 ++---------------- 5 files changed, 56 insertions(+), 480 deletions(-) delete mode 100644 tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json delete mode 100644 tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json delete mode 100644 tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json diff --git a/smdebug/tensorflow/keras.py b/smdebug/tensorflow/keras.py index 5475ebb34..d40b58d98 100644 --- a/smdebug/tensorflow/keras.py +++ b/smdebug/tensorflow/keras.py @@ -152,6 +152,7 @@ def _is_not_supported(self): from tensorflow.python.keras.distribute.distributed_training_utils import ( get_distributed_model, ) + except ImportError: # for tf1.13 we can't import this, so we can't support mirrored strategy self.logger.info( @@ -876,11 +877,19 @@ def on_test_begin(self, logs=None): self._on_any_mode_begin(ModeKeys.EVAL) def _on_any_mode_end(self, mode): - self._end_phase_python_profiling(mode=mode) - self._end_detailed_profiling() - self._end_dataloader_profiling() - self._stop_dataloader_profiling() + if self.python_profiler: + self.python_profiler.stop_profiling( + StepPhase.STEP_END, + end_mode=mode_keys_to_python_profile_mode(mode), + end_step=self.mode_steps[mode], + ) + self.python_profiler.start_profiling( + StepPhase.STEP_END, + start_mode=mode_keys_to_python_profile_mode(mode), + start_step=self.mode_steps[mode], + ) + self._stop_dataloader_profiling() def on_train_end(self, logs=None): self._on_any_mode_end(ModeKeys.TRAIN) @@ -929,6 +938,8 @@ def _on_any_batch_begin(self, batch, mode, logs=None): self.step_incremented_in_on_train_begin = False self.profiler_config_parser.load_config() + self._handle_dataloader_profiling(mode=mode) + self._handle_start_step_python_profiling(mode=mode) if self.prepared_collections is False: # sets prepared_collections to True here @@ -1055,6 +1066,7 @@ def _on_any_batch_end(self, batch, mode, logs=None): self._export_model() self._exported_model[self.mode] = True + self._handle_end_step_python_profiling(mode=mode) def on_train_batch_end(self, batch, logs=None): self._on_any_batch_end(batch, ModeKeys.TRAIN, logs=logs) @@ -1335,13 +1347,11 @@ def record_tensor_value(self, tensor_name, tensor_value): self._initialize_writers(only_initialize_if_missing=True) self._save_for_tensor(tensor_name, tensor_value, check_before_write=False) - def profiling_start_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the start of train batch when native tf2 training is used. 
:param mode: ModeKeys.TRAIN ModeKeys.EVAL ModeKeys.PREDICT """ - self.start = time.time() if self._is_not_supported(): @@ -1353,14 +1363,12 @@ def profiling_start_batch(self, mode=ModeKeys.TRAIN): # When only profiler is enabled in the native tf2 training, # increasing the step number in the TRAIN and GLOBAL mode # and not writing the state. - self._increment_step(write_state=False) self.profiler_config_parser.load_config() self._handle_dataloader_profiling(mode=mode) self._handle_start_step_python_profiling(mode=mode) self._handle_detailed_profiling(mode=mode) - def profiling_end_batch(self, mode=ModeKeys.TRAIN): """ Enabling profiler at the end of train batch when native tf2 training is used. @@ -1378,7 +1386,7 @@ def profiling_end_batch(self, mode=ModeKeys.TRAIN): pid=os.getpid(), step_num=str(self.mode_steps[mode]), ) - + self._handle_end_step_python_profiling(mode=mode) def profiling_end(self): """ diff --git a/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json deleted file mode 100644 index c119eebf8..000000000 --- a/tests/core/json_configs/test_tf2_python_profiler_all_params_config_parser_by_step.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "ProfilingParameters": { - "ProfilerEnabled": true, - "LocalPath": "/tmp/test", - "DetailedProfilingConfig": "{\"StartStep\": 2, \"NumSteps\": 2}", - "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"cProfile\"}" - } -} diff --git a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json deleted file mode 100644 index e51c386c2..000000000 --- a/tests/core/json_configs/test_tf2_python_profiler_cprofiler_config_parser_by_step.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "ProfilingParameters": { - "ProfilerEnabled": true, - "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 12, \"NumSteps\": 3}" - } -} diff --git a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json b/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json deleted file mode 100644 index 53ac1485e..000000000 --- a/tests/core/json_configs/test_tf2_python_profiler_pyinstrument_config_parser_by_step.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "ProfilingParameters": { - "ProfilerEnabled": true, - "LocalPath": "/tmp/test", - "PythonProfilingConfig": "{\"StartStep\": 8, \"NumSteps\": 2, \"ProfilerName\": \"Pyinstrument\"}" - } -} diff --git a/tests/profiler/tensorflow2/test_native_tf2_profiler.py b/tests/profiler/tensorflow2/test_native_tf2_profiler.py index a62951c6e..ba4283f86 100644 --- a/tests/profiler/tensorflow2/test_native_tf2_profiler.py +++ b/tests/profiler/tensorflow2/test_native_tf2_profiler.py @@ -10,6 +10,7 @@ # Third Party import pytest import tensorflow as tf +from tests.profiler.resources.profiler_config_parser_utils import build_metrics_config # First Party import smdebug.tensorflow as smd @@ -32,7 +33,6 @@ from smdebug.tensorflow import KerasHook as Hook -<<<<<<< HEAD @pytest.fixture def profiler_config_path(config_folder, monkeypatch): config_path = os.path.join(config_folder, "profiler_config.json") @@ -42,51 +42,33 @@ def profiler_config_path(config_folder, monkeypatch): os.remove(config_path) -def _convert_to_string(item): - return '"{0}"'.format(item) if isinstance(item, str) else item - - -def _convert_key_and_value(key, 
value):
-    return "{0}: {1}, ".format(_convert_to_string(key), _convert_to_string(value))
-
-
 def generate_profiler_config_parser(profiling_type, profiler_config_path, profiling_parameters):
-    python_profiler_config, detailed_profiler_config = "{}", "{}"
+    python_profiling_config, detailed_profiling_config = "{}", "{}"
     if profiling_type == "PythonProfiling":
         start_step, num_steps, profiler_name, cprofile_timer = profiling_parameters
-        python_profiler_config = "{"
-        if start_step is not None:
-            python_profiler_config += _convert_key_and_value("StartStep", start_step)
-        if num_steps is not None:
-            python_profiler_config += _convert_key_and_value("NumSteps", num_steps)
-        if profiler_name is not None:
-            python_profiler_config += _convert_key_and_value("ProfilerName", profiler_name)
-        if cprofile_timer is not None:
-            python_profiler_config += _convert_key_and_value("cProfileTimer", cprofile_timer)
-        python_profiler_config += "}"
+        python_profiling_config = build_metrics_config(
+            StartStep=start_step,
+            NumSteps=num_steps,
+            ProfilerName=profiler_name,
+            cProfileTimer=cprofile_timer,
+        )
     if profiling_type == "DetailedProfiling":
         start_step, num_steps, start_time, duration = profiling_parameters
-        detailed_profiler_config = "{"
-        if start_step:
-            detailed_profiler_config += _convert_key_and_value("StartStep", start_step)
-        if num_steps:
-            detailed_profiler_config += _convert_key_and_value("NumSteps", num_steps)
-        if start_time:
-            detailed_profiler_config += _convert_key_and_value(
-                "StartTimeInSecSinceEpoch", start_time
-            )
-        if duration:
-            detailed_profiler_config += _convert_key_and_value("DurationInSeconds", duration)
-        detailed_profiler_config += "}"
+        detailed_profiling_config = build_metrics_config(
+            StartStep=start_step,
+            NumSteps=num_steps,
+            StartTimeInSecSinceEpoch=start_time,
+            DurationInSeconds=duration,
+        )

     full_config = {
         "ProfilingParameters": {
             "ProfilerEnabled": True,
             "LocalPath": "/tmp/test",
-            "PythonProfilingConfig": python_profiler_config,
-            "DetailedProfilingConfig": detailed_profiler_config,
+            "PythonProfilingConfig": python_profiling_config,
+            "DetailedProfilingConfig": detailed_profiling_config,
         }
     }

@@ -106,34 +88,26 @@ def generate_profiler_config_parser_all_params(
     start_step_1, num_steps_1, profiler_name, cprofile_timer = python_profiling_parameters
     start_step_2, num_steps_2, start_time, duration = detailed_profiling_parameters

-    python_profiler_config = "{"
-    if start_step_1 is not None:
-        python_profiler_config += _convert_key_and_value("StartStep", start_step_1)
-    if num_steps_1 is not None:
-        python_profiler_config += _convert_key_and_value("NumSteps", num_steps_1)
-    if profiler_name is not None:
-        python_profiler_config += _convert_key_and_value("ProfilerName", profiler_name)
-    if cprofile_timer is not None:
-        python_profiler_config += _convert_key_and_value("cProfileTimer", cprofile_timer)
-    python_profiler_config += "}"
-
-    detailed_profiler_config = "{"
-    if start_step_2:
-        detailed_profiler_config += _convert_key_and_value("StartStep", start_step_2)
-    if num_steps_2:
-        detailed_profiler_config += _convert_key_and_value("NumSteps", num_steps_2)
-    if start_time:
-        detailed_profiler_config += _convert_key_and_value("StartTimeInSecSinceEpoch", start_time)
-    if duration:
-        detailed_profiler_config += _convert_key_and_value("DurationInSeconds", duration)
-    detailed_profiler_config += "}"
+    python_profiling_config = build_metrics_config(
+        StartStep=start_step_1,
+        NumSteps=num_steps_1,
+        ProfilerName=profiler_name,
+        cProfileTimer=cprofile_timer,
+    )
+
+    detailed_profiling_config = build_metrics_config(
+        StartStep=start_step_2,
+        NumSteps=num_steps_2,
+        StartTimeInSecSinceEpoch=start_time,
+        DurationInSeconds=duration,
+    )

     full_config = {
         "ProfilingParameters": {
             "ProfilerEnabled": True,
             "LocalPath": "/tmp/test",
-            "PythonProfilingConfig": python_profiler_config,
-            "DetailedProfilingConfig": detailed_profiler_config,
+            "PythonProfilingConfig": python_profiling_config,
+            "DetailedProfilingConfig": detailed_profiling_config,
         }
     }

@@ -146,58 +120,8 @@ def generate_profiler_config_parser_all_params(
     return profiler_config_parser


-<<<<<<< HEAD
-def set_up_profiling(profiler_config_parser):
-    profiler_config_parser = profiler_config_parser
-=======
-@pytest.fixture()
-def tf2_profiler_config_parser_by_step(config_folder, monkeypatch):
-    config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_step.json")
-    monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path)
-    return ProfilerConfigParser()
-
-
-@pytest.fixture()
-def tf2_python_cprofiler_config_parser_by_step(config_folder, monkeypatch):
-    config_path = os.path.join(
-        config_folder, "test_tf2_python_profiler_cprofiler_config_parser_by_step.json"
-    )
-    monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path)
-    return ProfilerConfigParser()
-
-
-@pytest.fixture()
-def tf2_python_pyinstrument_config_parser_by_step(config_folder, monkeypatch):
-    config_path = os.path.join(
-        config_folder, "test_tf2_python_profiler_pyinstrument_config_parser_by_step.json"
-    )
-    monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path)
-    return ProfilerConfigParser()
-
-
-@pytest.fixture()
-def tf2_profiler_config_parser_by_time(config_folder, monkeypatch):
-    config_path = os.path.join(config_folder, "test_tf2_profiler_config_parser_by_time.json")
-    monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path)
-    return ProfilerConfigParser()
-
-
-@pytest.fixture()
-def tf2_profiler_config_parser_by_step_all_params(config_folder, monkeypatch):
-    config_path = os.path.join(
-        config_folder, "test_tf2_python_profiler_all_params_config_parser_by_step.json"
-    )
-    monkeypatch.setenv("SMPROFILER_CONFIG_PATH", config_path)
-    return ProfilerConfigParser()
-
-
-def set_up_profiling(profilerconfig):
-    profiler_config_parser = profilerconfig
->>>>>>> update profiler for native tf training
-=======
 def set_up_profiling(profiler_config):
     profiler_config_parser = profiler_config
->>>>>>> add docstring, update helper function names and improve the unit tests
     python_profiler = None
     if profiler_config_parser.profiling_enabled:
         config = profiler_config_parser.config
@@ -246,11 +170,7 @@ def train_step(images, labels):
     opt = tf.keras.optimizers.Adam()
     hook.wrap_optimizer(opt)

-<<<<<<< HEAD
-    step = 0
-=======
     current_step = 0
->>>>>>> update profiler for native tf training
     n_epochs = 1
     for epoch in range(n_epochs):
         for data, labels in dataset:
@@ -258,20 +178,9 @@ def train_step(images, labels):
             labels = tf.one_hot(labels, depth=10)
             if debugger:
                 with hook.wrap_tape(tf.GradientTape()) as tape:
-<<<<<<< HEAD
-<<<<<<< HEAD
-                    hook.profiling_start_batch()
-=======
->>>>>>> update format
-                    logits = train_step(data, labels)
-                    if python_profiler and start_step <= step < end_step:
-                        assert python_profiler._start_step == step
-=======
-                    hook.profiling_start_batch(mode=smd.modes.TRAIN)
                     logits = train_step(data, labels)
                     if python_profiler and start_step <= current_step < end_step:
                         assert python_profiler._start_step == current_step
->>>>>>> update profiler for native tf training
                     assert python_profiler._start_phase == StepPhase.STEP_START
                     grads = tape.gradient(logits, model.variables)
                     opt.apply_gradients(zip(grads, model.variables))
@@ -280,25 +189,20 @@ def train_step(images, labels):
                     hook.save_tensor("labels", labels, CollectionKeys.OUTPUTS)
             else:
                 with tf.GradientTape() as tape:
-<<<<<<< HEAD
-<<<<<<< HEAD
-                    hook.profiling_start_batch()
-=======
->>>>>>> update format
                     logits = train_step(data, labels)
-                    if python_profiler and start_step <= step < end_step:
-                        assert python_profiler._start_step == step
+                    if python_profiler and start_step <= current_step < end_step:
+                        assert python_profiler._start_step == current_step
                         assert python_profiler._start_phase == StepPhase.STEP_START
                     grads = tape.gradient(logits, model.variables)
                     opt.apply_gradients(zip(grads, model.variables))
             hook.profiling_end_batch()
-            if python_profiler and start_step <= step < end_step:
-                assert python_profiler._start_step == step
+            if python_profiler and start_step <= current_step < end_step:
+                assert python_profiler._start_step == current_step
                 assert python_profiler._start_phase == StepPhase.STEP_END
-            step += 1
+            current_step += 1
     hook.profiling_end()
     if python_profiler:
-        assert python_profiler._start_step == step - 1
+        assert python_profiler._start_step == current_step - 1
         assert python_profiler._start_phase == StepPhase.STEP_END


@@ -325,48 +229,12 @@ def verify_num_trace_events(profiler_config):
     """
     This verifies the number of events when detailed profiling is enabled.
     """
-=======
-                    hook.profiling_start_batch(mode=smd.modes.TRAIN)
-                    logits = train_step(data, labels)
-                    if python_profiler and start_step <= current_step < end_step:
-                        assert python_profiler._start_step == current_step
-                        assert python_profiler._start_phase == StepPhase.STEP_START
-                    grads = tape.gradient(logits, model.variables)
-                    opt.apply_gradients(zip(grads, model.variables))
-            hook.profiling_end_batch(mode=smd.modes.TRAIN)
-    hook.profiling_end()
-
-
-@pytest.mark.skip_if_non_eager
-def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step, out_dir):
-    """
-    This test executes a TF2 native training script with profiler, enables detailed TF profiling by step, and
-    verifies the number of events.
- """ - assert tf2_profiler_config_parser_by_step.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_gradtape(hook=hook) - ->>>>>>> update profiler for native tf training t_events = TensorboardProfilerEvents() # get tensorboard timeline files files = [] -<<<<<<< HEAD -<<<<<<< HEAD - for path in Path(profilerconfig.config.local_path + "/framework").rglob( -======= - for path in Path(tf2_profiler_config_parser_by_step.config.local_path + "/framework").rglob( ->>>>>>> update profiler for native tf training -======= - -<<<<<<< HEAD - for path in Path(os.path.join(profilerconfig.config.local_path + "/framework")).rglob( ->>>>>>> rename the debugger native training flag and update the path join in the unit test -======= + for path in Path(os.path.join(profiler_config.config.local_path + "/framework")).rglob( ->>>>>>> add docstring, update helper function names and improve the unit tests f"*{TENSORBOARDTIMELINE_SUFFIX}" ): files.append(path) @@ -386,15 +254,6 @@ def test_native_tf2_profiler_by_step_profiler(tf2_profiler_config_parser_by_step assert num_trace_events >= 230 -<<<<<<< HEAD -<<<<<<< HEAD -def train_loop(out_dir, debugger=False): - hook = Hook(out_dir=out_dir, save_all=True) - helper_native_tf2_gradtape(hook=hook, debugger=debugger) - - -======= ->>>>>>> add docstring, update helper function names and improve the unit tests def verify_tensor_names(out_dir): """ This verifies the tensor names when debugger is enabled. @@ -422,164 +281,6 @@ def verify_timeline_file(out_dir): This verifies the creation of the timeline file according to file path specification. It reads backs the file contents to make sure it is in valid JSON format. """ -======= -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_by_time_profiler(tf2_profiler_config_parser_by_time, out_dir): - """ - This test executes a TF2 native training script with profiler, enables detailed TF profiling by time, and - verifies the number of events. - """ - assert tf2_profiler_config_parser_by_time.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_gradtape(hook=hook) - - # get tensorboard timeline files - files = [] - for path in Path(tf2_profiler_config_parser_by_time.config.local_path + "/framework").rglob( - f"*{TENSORBOARDTIMELINE_SUFFIX}" - ): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events = TensorboardProfilerEvents() - - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 700 - - -@pytest.mark.skip_if_non_eager -def test_native_python_profiling_cprofiler(out_dir, tf2_python_cprofiler_config_parser_by_step): - """ - This test executes a TF2 native training script with profiler, enables cprofiler by step, and - verifies the python profiling's steps and expected output files. 
- """ - assert tf2_python_cprofiler_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling( - tf2_python_cprofiler_config_parser_by_step - ) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = CPROFILE_NAME - allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_gradtape( - hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step - ) - - # Test that directory and corresponding files exist. - assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - # Since python_profiler.stop_profiling for the posthookclose step automatically executed - # upon normal interpreter termination, - # the number of the files is (end_step - start_step) * 2 + 2 - 1. - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -@pytest.mark.skip_if_non_eager -def test_native_python_profiling_pyinstrument( - out_dir, tf2_python_pyinstrument_config_parser_by_step -): - """ - This test executes a TF2 native training script with profiler, enables pyinstrument by step, and - verifies the python profiling's steps and expected output files. - """ - assert tf2_python_pyinstrument_config_parser_by_step.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling( - tf2_python_pyinstrument_config_parser_by_step - ) - - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps - - profiler_name = PYINSTRUMENT_NAME - allowed_files = [PYINSTRUMENT_JSON_FILENAME, PYINSTRUMENT_HTML_FILENAME] - python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) - - hook = Hook(out_dir=out_dir) - hook.python_profiler = python_profiler - helper_native_tf2_gradtape( - hook=hook, python_profiler=python_profiler, start_step=start_step, end_step=end_step - ) - - # Test that directory and corresponding files exist. - assert os.path.isdir(python_stats_dir) - - for node_id in os.listdir(python_stats_dir): - node_dir_path = os.path.join(python_stats_dir, node_id) - stats_dirs = os.listdir(node_dir_path) - # Since python_profiler.stop_profiling for the posthookclose step automatically executed - # upon normal interpreter termination, - # the number of the files is (end_step - start_step) * 2 + 2 - 1. 
- assert len(stats_dirs) == (end_step - start_step) * 2 + 1 - - for stats_dir in stats_dirs: - # Validate that the expected files are in the stats dir - stats_dir_path = os.path.join(node_dir_path, stats_dir) - stats_files = os.listdir(stats_dir_path) - assert set(stats_files) == set(allowed_files) - - # Validate the actual stats files - for stats_file in stats_files: - stats_path = os.path.join(stats_dir_path, stats_file) - if stats_file == CPROFILE_STATS_FILENAME: - assert pstats.Stats(stats_path) - elif stats_file == PYINSTRUMENT_JSON_FILENAME: - with open(stats_path, "r") as f: - assert json.load(f) - - -@pytest.mark.skip_if_non_eager -def test_create_timeline_file(simple_profiler_config_parser, out_dir): - """ - This test is to test the creation of the timeline file according to file path specification. - It reads backs the file contents to make sure it is in valid JSON format. - """ - assert simple_profiler_config_parser.profiling_enabled - - hook = Hook(out_dir=out_dir) - helper_native_tf2_gradtape(hook=hook) - ->>>>>>> update profiler for native tf training files = [] for path in Path(os.path.join(out_dir + "/" + DEFAULT_PREFIX)).rglob("*.json"): files.append(path) @@ -601,44 +302,12 @@ def test_create_timeline_file(simple_profiler_config_parser, out_dir): assert events_dict -<<<<<<< HEAD -<<<<<<< HEAD -def verify_python_profiling(profiler_name, out_dir, profilerconfig, debugger=False): -======= def verify_python_profiling(profiler_name, out_dir, num_steps): ->>>>>>> add docstring, update helper function names and improve the unit tests """ This executes a TF2 native training script with profiler or both profiler and debugger, enables python profiling by step, and verifies the python profiling's steps and expected output files. """ -<<<<<<< HEAD - assert profilerconfig.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling(profilerconfig) -======= -@pytest.mark.skip_if_non_eager -def test_native_tf2_profiler_debugger_all_params( - tf2_profiler_config_parser_by_step_all_params, out_dir -): - """ - This test executes a TF2 native training script with debugger and profiler, enables detailed TF profiling, python - profiling by step. - """ - assert tf2_profiler_config_parser_by_step_all_params.profiling_enabled - - profiler_config_parser, python_profiler = set_up_profiling( - tf2_profiler_config_parser_by_step_all_params - ) ->>>>>>> update profiler for native tf training - config = profiler_config_parser.config - start_step = config.python_profiling_config.start_step - num_steps = config.python_profiling_config.num_steps - end_step = start_step + num_steps -======= ->>>>>>> add docstring, update helper function names and improve the unit tests - -<<<<<<< HEAD if profiler_name == CPROFILE_NAME: allowed_files = [CPROFILE_STATS_FILENAME] @@ -647,31 +316,16 @@ def test_native_tf2_profiler_debugger_all_params( python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) - # Test that directory and corresponding files exist. -======= - profiler_name = CPROFILE_NAME - allowed_files = [CPROFILE_STATS_FILENAME] - python_stats_dir = os.path.join(out_dir, "framework/", "tensorflow/", profiler_name) - - hook = Hook(out_dir=out_dir, save_all=True) - hook.python_profiler = python_profiler - helper_native_tf2_gradtape(hook=hook, debugger=True) - - # Verifying python profiling related files. 
->>>>>>> update profiler for native tf training assert os.path.isdir(python_stats_dir) for node_id in os.listdir(python_stats_dir): node_dir_path = os.path.join(python_stats_dir, node_id) stats_dirs = os.listdir(node_dir_path) -<<<<<<< HEAD + # Since python_profiler.stop_profiling for the posthookclose step automatically executed # upon normal interpreter termination, # the number of the files is num_steps * 2 + 2 - 1. assert len(stats_dirs) == num_steps * 2 + 1 -======= - assert len(stats_dirs) == (end_step - start_step) * 2 + 1 ->>>>>>> update profiler for native tf training for stats_dir in stats_dirs: # Validate that the expected files are in the stats dir @@ -688,7 +342,6 @@ def test_native_tf2_profiler_debugger_all_params( with open(stats_path, "r") as f: assert json.load(f) -<<<<<<< HEAD @pytest.mark.skip_if_non_eager @pytest.mark.parametrize("enable_detailed_profiling", [False, True]) @@ -800,66 +453,3 @@ def test_native_tf2_profiling_debugger( verify_tensor_names(out_dir) else: pass -======= - # Verifying detailed TF profiling. - t_events = TensorboardProfilerEvents() - - # get tensorboard timeline files - files = [] - for path in Path( - tf2_profiler_config_parser_by_step_all_params.config.local_path + "/framework" - ).rglob(f"*{TENSORBOARDTIMELINE_SUFFIX}"): - files.append(path) - - assert len(files) == 1 - - trace_file = str(files[0]) - t_events.read_events_from_file(trace_file) - - all_trace_events = t_events.get_all_events() - num_trace_events = len(all_trace_events) - - print(f"Number of events read = {num_trace_events}") - - # The number of events is varying by a small number on - # consecutive runs. Hence, the approximation in the below asserts. - assert num_trace_events >= 230 - - # Verifying timeline files. - files = [] - for path in Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"): - files.append(path) - - assert len(files) == 1 - - file_ts = files[0].name.split("_")[0] - folder_name = files[0].parent.name - assert folder_name == time.strftime( - TRACE_DIRECTORY_FORMAT, time.gmtime(int(file_ts) / CONVERT_TO_MICROSECS) - ) - assert folder_name == datetime.strptime(folder_name, TRACE_DIRECTORY_FORMAT).strftime( - TRACE_DIRECTORY_FORMAT - ) - - with open(files[0]) as timeline_file: - events_dict = json.load(timeline_file) - - assert events_dict - - # Verifying tensor names. - trial = smd.create_trial(out_dir) - assert len(trial.steps()) > 0, "Nothing saved at any step." - assert len(trial.tensor_names()) > 0, "Tensors were not saved." - assert trial.tensor_names(collection=CollectionKeys.LOSSES) == ["loss"] - assert len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0 - assert len(trial.tensor_names(collection=CollectionKeys.BIASES)) > 0 - assert trial.tensor_names(collection="optimizer_variables") == [ - "Adam/beta_1:0", - "Adam/beta_2:0", - "Adam/decay:0", - "Adam/iter:0", - "Adam/learning_rate:0", - ] - assert trial.tensor_names(collection=CollectionKeys.INPUTS) == ["inputs"] - assert trial.tensor_names(collection=CollectionKeys.OUTPUTS) == ["labels", "logits"] ->>>>>>> update profiler for native tf training
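
Note on the helper refactor in the hunks above: the tests now delegate config-string construction to a build_metrics_config utility instead of concatenating "key: value" pairs by hand. The utility itself is outside this diff; a minimal sketch of the behavior the call sites appear to rely on (drop parameters that were not provided, serialize the rest), assuming JSON-style output is acceptable:

    import json


    def build_metrics_config(**kwargs):
        # Keep only the parameters that were actually provided and serialize
        # them into the string stored under PythonProfilingConfig /
        # DetailedProfilingConfig in the ProfilingParameters block.
        return json.dumps({key: value for key, value in kwargs.items() if value is not None})


    # Example: unset parameters simply disappear from the config string.
    build_metrics_config(StartStep=5, NumSteps=2, ProfilerName=None)
    # -> '{"StartStep": 5, "NumSteps": 2}'

This only illustrates the contract the call sites need; the real helper in the shared test utilities may format the string differently.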
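
The removed fixtures and the generate_profiler_config_parser helpers follow the same pattern: write a ProfilingParameters JSON document to disk, point SMPROFILER_CONFIG_PATH at it, then construct a ProfilerConfigParser. A rough sketch of that flow, with an illustrative file path and values:

    import json
    import os

    # Shape mirrors the full_config dictionaries built by the helpers above.
    full_config = {
        "ProfilingParameters": {
            "ProfilerEnabled": True,
            "LocalPath": "/tmp/test",
            "PythonProfilingConfig": '{"StartStep": 5, "NumSteps": 2}',
            "DetailedProfilingConfig": "{}",
        }
    }

    config_path = "/tmp/test_profiler_config.json"  # hypothetical location
    with open(config_path, "w") as f:
        json.dump(full_config, f)

    # ProfilerConfigParser picks up its configuration from this variable.
    os.environ["SMPROFILER_CONFIG_PATH"] = config_path

In the pytest fixtures the same thing is done through monkeypatch.setenv so the environment variable is restored after each test.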
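
The python-profiling assertions in these tests all rely on the same accounting: each profiled step produces two stats directories (one for the phase entered at step start, one at step end), plus two more for the profiling phases outside the step loop; the comments only name the post-hook-close one, which is flushed at interpreter exit after the assertion has already run, which is where the "* 2 + 2 - 1" arithmetic comes from. A small sketch of that check (the directory layout follows the tests above, the helper itself is illustrative):

    import os


    def expected_num_stats_dirs(num_steps):
        # Two directories per profiled step, plus the remaining out-of-step
        # phase; the post-hook-close directory is written at interpreter
        # exit, after the test asserts, hence "+ 2 - 1".
        return num_steps * 2 + 2 - 1


    def check_python_stats_dirs(python_stats_dir, num_steps):
        for node_id in os.listdir(python_stats_dir):
            node_dir_path = os.path.join(python_stats_dir, node_id)
            assert len(os.listdir(node_dir_path)) == expected_num_stats_dirs(num_steps)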