Skip to content

Commit 0fbfead

Browse files
Updating analysis utils (#63)
* Modify step stats util to compute stats for multiproc data
* Modify utils to handle multi-node data
* Modify notebook utils to handle multi-node data

Co-authored-by: Neelesh Dodda <[email protected]>
1 parent b71fe99 commit 0fbfead

File tree

6 files changed

+303
-152
lines changed

6 files changed

+303
-152
lines changed

smdebug/profiler/analysis/notebook_utils/heatmap.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,12 @@ def __init__(
2323
select_dimensions=[".*CPU", ".*GPU", ".*Memory"],
2424
select_events=[".*"],
2525
plot_height=350,
26+
show_workers=True,
2627
):
2728

2829
self.select_dimensions = select_dimensions
2930
self.select_events = select_events
31+
self.show_workers = show_workers
3032
self.metrics_reader = metrics_reader
3133
self.available_dimensions = []
3234
self.available_events = []
@@ -54,13 +56,17 @@ def preprocess_system_metrics(self, events, system_metrics):
5456

5557
# read all available system metric events and store them in dict
5658
for event in events:
57-
if event.dimension not in system_metrics:
58-
system_metrics[event.dimension] = {}
59-
self.available_dimensions.append(event.dimension)
60-
if event.name not in system_metrics[event.dimension]:
61-
system_metrics[event.dimension][event.name] = []
59+
if self.show_workers is True:
60+
event_unique_id = f"{event.dimension}-nodeid:{str(event.node_id)}"
61+
else:
62+
event_unique_id = event.dimension
63+
if event_unique_id not in system_metrics:
64+
system_metrics[event_unique_id] = {}
65+
self.available_dimensions.append(event_unique_id)
66+
if event.name not in system_metrics[event_unique_id]:
67+
system_metrics[event_unique_id][event.name] = []
6268
self.available_events.append(event.name)
63-
system_metrics[event.dimension][event.name].append([event.timestamp, event.value])
69+
system_metrics[event_unique_id][event.name].append([event.timestamp, event.value])
6470

6571
for dimension in system_metrics:
6672
for event in system_metrics[dimension]:
@@ -104,7 +110,14 @@ def create_plot(self):
104110
yaxis = {}
105111

106112
# number of datapoints
107-
self.width = self.system_metrics["CPUUtilization"]["total"].shape[0]
113+
max_width = 0
114+
for key in self.system_metrics.keys():
115+
if key.startswith("CPUUtilization"):
116+
width = self.system_metrics[key]["total"].shape[0]
117+
if width >= max_width:
118+
max_width = width
119+
120+
self.width = max_width
108121

109122
for dimension in self.filtered_dimensions:
110123
for event in self.filtered_events:
@@ -183,7 +196,14 @@ def update_data(self, current_timestamp):
183196
new_system_metrics[dimension][event],
184197
]
185198
)
186-
self.width = self.system_metrics["CPUUtilization"]["cpu0"].shape[0]
199+
max_width = 0
200+
for key in self.system_metrics.keys():
201+
if key.startswith("CPUUtilization"):
202+
width = self.system_metrics[key]["cpu0"].shape[0]
203+
if width >= max_width:
204+
max_width = width
205+
206+
self.width = max_width
187207

188208
tmp = []
189209
metric_names = []

smdebug/profiler/analysis/notebook_utils/metrics_histogram.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,14 @@ def __init__(self, metrics_reader):
2929
@param select_metrics is array of metrics to be selected, Default ["cpu", "gpu"]
3030
"""
3131

32-
def plot(self, starttime=0, endtime=None, select_dimensions=[".*"], select_events=[".*"]):
32+
def plot(
33+
self,
34+
starttime=0,
35+
endtime=None,
36+
select_dimensions=[".*"],
37+
select_events=[".*"],
38+
show_workers=True,
39+
):
3340
if endtime == None:
3441
endtime = self.metrics_reader.get_timestamp_of_latest_available_file()
3542
all_events = self.metrics_reader.get_events(starttime, endtime)
@@ -39,6 +46,7 @@ def plot(self, starttime=0, endtime=None, select_dimensions=[".*"], select_event
3946
self.last_timestamp = endtime
4047
self.select_dimensions = select_dimensions
4148
self.select_events = select_events
49+
self.show_workers = show_workers
4250
self.system_metrics = self.preprocess_system_metrics(
4351
all_events=all_events, system_metrics={}
4452
)
@@ -52,13 +60,17 @@ def preprocess_system_metrics(self, all_events=[], system_metrics={}):
5260

5361
# read all available system metric events and store them in dict
5462
for event in all_events:
55-
if event.dimension not in system_metrics:
56-
system_metrics[event.dimension] = {}
57-
self.available_dimensions.append(event.dimension)
58-
if event.name not in system_metrics[event.dimension]:
59-
system_metrics[event.dimension][event.name] = []
63+
if self.show_workers is True:
64+
event_unique_id = f"{event.dimension}-nodeid:{str(event.node_id)}"
65+
else:
66+
event_unique_id = event.dimension
67+
if event_unique_id not in system_metrics:
68+
system_metrics[event_unique_id] = {}
69+
self.available_dimensions.append(event_unique_id)
70+
if event.name not in system_metrics[event_unique_id]:
71+
system_metrics[event_unique_id][event.name] = []
6072
self.available_events.append(event.name)
61-
system_metrics[event.dimension][event.name].append(event.value)
73+
system_metrics[event_unique_id][event.name].append(event.value)
6274

6375
# compute total utilization per event dimension
6476
for event_dimension in system_metrics:

smdebug/profiler/analysis/notebook_utils/timeline_charts.py

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,12 @@ def __init__(
2525
select_dimensions=[".*"],
2626
select_events=[".*"],
2727
x=1000,
28+
show_workers=True,
2829
):
2930

3031
self.select_dimensions = select_dimensions
3132
self.select_events = select_events
33+
self.show_workers = show_workers
3234

3335
# placeholder
3436
self.sources = {}
@@ -52,10 +54,16 @@ def __init__(
5254
self.start = 0 # replace with system_metrics_reader.get_first_available_timestamp()/1000000
5355
self.system_metrics = self.preprocess_system_metrics(events, system_metrics={})
5456

55-
if x < self.system_metrics["CPUUtilization"]["total"].shape[0]:
57+
min_width = float("inf")
58+
for key in self.system_metrics.keys():
59+
if key.startswith("CPUUtilization"):
60+
width = self.system_metrics[key]["total"].shape[0]
61+
if width <= min_width:
62+
min_width = width
63+
if x < min_width:
5664
self.width = x
5765
else:
58-
self.width = self.system_metrics["CPUUtilization"]["total"].shape[0] - 1
66+
self.width = min_width - 1
5967

6068
# create plot
6169
self.create_plot()
@@ -64,13 +72,17 @@ def preprocess_system_metrics(self, events, system_metrics):
6472

6573
# read all available system metric events and store them in dict
6674
for event in events:
67-
if event.dimension not in system_metrics:
68-
system_metrics[event.dimension] = {}
69-
self.available_dimensions.append(event.dimension)
70-
if event.name not in system_metrics[event.dimension]:
71-
system_metrics[event.dimension][event.name] = []
75+
if self.show_workers is True:
76+
event_unique_id = f"{event.dimension}-nodeid:{str(event.node_id)}"
77+
else:
78+
event_unique_id = event.dimension
79+
if event_unique_id not in system_metrics:
80+
system_metrics[event_unique_id] = {}
81+
self.available_dimensions.append(event_unique_id)
82+
if event.name not in system_metrics[event_unique_id]:
83+
system_metrics[event_unique_id][event.name] = []
7284
self.available_events.append(event.name)
73-
system_metrics[event.dimension][event.name].append([event.timestamp, event.value])
85+
system_metrics[event_unique_id][event.name].append([event.timestamp, event.value])
7486

7587
for dimension in system_metrics:
7688
for event in system_metrics[dimension]:
@@ -200,8 +212,15 @@ def create_plot(self):
200212
def find_time_annotations(self, indexes):
201213

202214
if len(indexes) > 0:
203-
begin_timestamp = self.system_metrics["CPUUtilization"]["total"][np.min(indexes), 0]
204-
end_timestamp = self.system_metrics["CPUUtilization"]["total"][np.max(indexes), 0]
215+
cpu_util = None
216+
for key in self.system_metrics.keys():
217+
if key.startswith("CPUUtilization"):
218+
width = self.system_metrics[key]["total"].shape[0]
219+
if cpu_util is None or np.min(indexes) <= width <= np.max(indexes):
220+
cpu_util = self.system_metrics[key]
221+
222+
begin_timestamp = cpu_util["total"][np.min(indexes), 0]
223+
end_timestamp = cpu_util["total"][np.max(indexes), 0]
205224
total_time = end_timestamp - begin_timestamp
206225
print(
207226
f"Selected timerange: {begin_timestamp + self.start} to {end_timestamp + self.start}"
@@ -281,8 +300,15 @@ def plot_dataloaders(self, events, begin_timestamp, end_timestamp):
281300
def plot_detailed_profiler_data(self, indexes):
282301

283302
if len(indexes) > 0:
284-
begin_timestamp = self.system_metrics["CPUUtilization"]["cpu0"][np.min(indexes), 0]
285-
end_timestamp = self.system_metrics["CPUUtilization"]["cpu0"][np.max(indexes), 0]
303+
cpu_util = None
304+
for key in self.system_metrics.keys():
305+
if key.startswith("CPUUtilization"):
306+
width = self.system_metrics[key]["cpu0"].shape[0]
307+
if cpu_util is None or np.min(indexes) <= width <= np.max(indexes):
308+
cpu_util = self.system_metrics[key]
309+
310+
begin_timestamp = cpu_util["cpu0"][np.min(indexes), 0]
311+
end_timestamp = cpu_util["cpu0"][np.max(indexes), 0]
286312
print(
287313
f"Selected timerange: {begin_timestamp + self.start} to {end_timestamp + self.start}"
288314
)
@@ -408,13 +434,22 @@ def update_data(self, current_timestamp):
408434
event
409435
][self.system_metrics[dimension][event][:, 0].argsort()]
410436

411-
self.width = self.system_metrics["CPUUtilization"]["total"].shape[0] - 1
437+
max_width = 0
438+
cpu_util = None
439+
for key in self.system_metrics.keys():
440+
if key.startswith("CPUUtilization"):
441+
width = self.system_metrics[key]["total"].shape[0]
442+
if cpu_util is None or width >= max_width:
443+
max_width = width
444+
cpu_util = self.system_metrics[key]
445+
446+
self.width = max_width - 1
412447

413448
if self.width > 1000:
414-
min_value = self.system_metrics["CPUUtilization"]["total"][-1000, 0]
449+
min_value = cpu_util["total"][-1000, 0]
415450
else:
416-
min_value = self.system_metrics["CPUUtilization"]["total"][-self.width, 0]
417-
max_value = self.system_metrics["CPUUtilization"]["total"][-1, 0]
451+
min_value = cpu_util["total"][-self.width, 0]
452+
max_value = cpu_util["total"][-1, 0]
418453

419454
for figure in self.figures:
420455
figure.x_range.start = int(min_value)

0 commit comments

Comments
 (0)