Skip to content

Commit 4604a30

Browse files
authored
Merge pull request #10 from awslabs/metrics_histogram_fixes
Refactoring metrics histogram and step histogram
2 parents a472820 + 376fdfd commit 4604a30

File tree

6 files changed

+280
-176
lines changed

6 files changed

+280
-176
lines changed

examples/profiler/profiler_generic_dashboard.ipynb

Lines changed: 85 additions & 23 deletions
Large diffs are not rendered by default.
Binary file not shown.

smdebug/profiler/algorithm_metrics_reader.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,9 @@ def __init__(self, s3_trial_path, use_in_memory_cache=False):
171171
self.bucket_name = bucket_name
172172
self.base_folder = base_folder
173173
self.prefix = os.path.join(self.base_folder, self.prefix, "")
174+
self.logger.info(
175+
f"S3AlgorithmMetricsReader created with bucket:{bucket_name} and prefix:{self.prefix}"
176+
)
174177
# Pre-build the file list so that user can query get_timestamp_of_latest_available_file() and get_current_time_range_for_event_query
175178
self.refresh_event_file_list()
176179

@@ -202,9 +205,9 @@ def parse_event_files(self, event_files):
202205
"""
203206

204207
def refresh_event_file_list(self):
205-
list_dir = ListRequest(
206-
Bucket=self.bucket_name,
207-
Prefix=self.prefix,
208-
StartAfter=self._startAfter_prefix if self._startAfter_prefix else self.prefix,
208+
start_after = self._startAfter_prefix if self._startAfter_prefix else self.prefix
209+
self.logger.debug(
210+
f"Making listreq with bucket:{self.bucket_name} prefix:{self.prefix} startAfter:{start_after}"
209211
)
212+
list_dir = ListRequest(Bucket=self.bucket_name, Prefix=self.prefix, StartAfter=start_after)
210213
self._refresh_event_file_list_s3_mode(list_dir)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Local
2+
from .metrics_histogram import MetricsHistogram
3+
from .step_histogram import StepHistogram
4+
from .training_job import TrainingJob

smdebug/profiler/analysis/notebook_utils/metrics_histogram.py

Lines changed: 84 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -12,71 +12,96 @@
1212

1313

1414
class MetricsHistogram:
15-
def __init__(self, metrics_reader, select_metrics=None):
15+
def __init__(self, metrics_reader):
1616

1717
self.metrics_reader = metrics_reader
1818

1919
# get timestamp of latest files
2020
self.last_timestamp = self.metrics_reader.get_timestamp_of_latest_available_file()
21-
self.all_events = self.metrics_reader.get_events(0, self.last_timestamp)
22-
23-
# define the list of metrics to plot: per default cpu and gpu
24-
self.select_metrics = ["cpu", "gpu"]
25-
if select_metrics is not None:
26-
self.select_metrics.extend(select_metrics)
27-
28-
self.preprocess_system_metrics()
29-
30-
self.create_plot()
31-
32-
def preprocess_system_metrics(self):
21+
self.seen_system_metric_list = set()
22+
self.select_metrics = []
23+
self.sources = {}
24+
self.target = None
25+
26+
"""
27+
@param starttime is starttime_since_epoch_in_micros. Default value is 0, i.e., from the start
28+
@param endtime is endtime_since_epoch_in_micros. Default value is None, in which case the timestamp of the latest available file reported by metrics_reader is used
29+
@param select_metrics is an array of regex patterns for the metrics to be selected. Default [".*"], which selects every metric
30+
"""
31+
32+
def plot(self, starttime=0, endtime=None, select_metrics=[".*"]):
33+
if endtime == None:
34+
endtime = self.metrics_reader.get_timestamp_of_latest_available_file()
35+
all_events = self.metrics_reader.get_events(starttime, endtime)
36+
print(
37+
f"Found {len(all_events)} system metrics events from timestamp_in_us:{starttime} to timestamp_in_us:{endtime}"
38+
)
39+
self.last_timestamp = endtime
40+
self.select_metrics = select_metrics
41+
self.system_metrics = self.preprocess_system_metrics(all_events=all_events)
42+
self.create_plot(self.system_metrics)
43+
44+
def clear():
45+
self.system_metrics = {}
46+
self.sources = {}
3347

48+
def preprocess_system_metrics(self, all_events=[], system_metrics={}):
49+
cpu_name = None
3450
# read all available system metric events and store them in dict
35-
self.system_metrics = {}
36-
for event in self.all_events:
37-
if (
38-
event.name not in self.system_metrics
39-
and event.dimension is not "GPUMemoryUtilization"
40-
):
41-
self.system_metrics[event.name] = []
42-
self.system_metrics[event.name].append(event.value)
51+
for event in all_events:
52+
if event.name not in system_metrics:
53+
system_metrics[event.name] = []
54+
if cpu_name is None and event.dimension == "CPUUtilization":
55+
cpu_name = event.name
56+
print(cpu_name)
57+
system_metrics[event.name].append(event.value)
4358

4459
# total cpu utilization is not recorded in SM
45-
self.cores = 0.0
46-
cpu_total = np.zeros(len(self.system_metrics["cpu0"]))
47-
for metric in self.system_metrics:
48-
if "cpu" in metric:
49-
self.cores += 1
50-
cpu_total += self.system_metrics[metric]
60+
if cpu_name is not None:
61+
self.cores = 0.0
62+
cpu_total = np.zeros(len(system_metrics[cpu_name]))
63+
for metric in system_metrics:
64+
# TODO should we do similar for gpu too
65+
if "cpu" in metric and metric:
66+
if metric not in self.seen_system_metric_list:
67+
self.cores += 1
68+
self.seen_system_metric_list.add(metric)
5169

52-
self.system_metrics["cpu_total"] = cpu_total / self.cores
70+
cpu_total += system_metrics[metric]
5371

54-
# number of datapoints
55-
self.width = self.system_metrics["cpu_total"].shape[0]
72+
system_metrics["cpu_total"] = cpu_total / self.cores
5673

5774
# add user defined metrics to the list
58-
self.metrics = []
59-
available_metrics = list(self.system_metrics.keys())
75+
filtered_metrics = []
76+
available_metrics = list(system_metrics.keys())
77+
print(f"select metrics:{self.select_metrics}")
6078

6179
for metric in self.select_metrics:
62-
r = re.compile(".*" + metric)
63-
self.metrics.extend(list(filter(r.match, available_metrics)))
64-
65-
def create_plot(self):
66-
80+
r = re.compile(r".*" + metric + r".*")
81+
filtered_metrics.extend(list(filter(r.search, available_metrics)))
82+
print(f"filtered_metrics:{filtered_metrics}")
83+
84+
# delete the keys that need to be filtered out
85+
for key in available_metrics:
86+
if key not in filtered_metrics and "total" not in key:
87+
del system_metrics[key]
88+
return system_metrics
89+
90+
def _get_probs_binedges(self, values):
91+
# create histogram bins
92+
bins = np.arange(0, 100, 2)
93+
probs, binedges = np.histogram(values, bins=bins)
94+
bincenters = 0.5 * (binedges[1:] + binedges[:-1])
95+
return probs, binedges
96+
97+
def create_plot(self, system_metrics={}):
98+
metrics = list(system_metrics.keys())
6799
figures = []
68-
self.sources = {}
69100

70101
# create a histogram per metric
71-
for index, metric in enumerate(self.metrics):
102+
for index, metric in enumerate(metrics):
72103
p = figure(plot_height=250, plot_width=250)
73-
values = self.system_metrics[metric]
74-
75-
# create histogram bins
76-
bins = np.arange(0, 100, 2)
77-
probs, binedges = np.histogram(values, bins=bins)
78-
bincenters = 0.5 * (binedges[1:] + binedges[:-1])
79-
104+
probs, binedges = self._get_probs_binedges(system_metrics[metric])
80105
# set data
81106
source = ColumnDataSource(data=dict(top=probs, left=binedges[:-1], right=binedges[1:]))
82107
self.sources[metric] = source
@@ -93,47 +118,29 @@ def create_plot(self):
93118

94119
# set plot
95120
p.y_range.start = 0
96-
p.xaxis.axis_label = metric
121+
p.xaxis.axis_label = metric + " util"
122+
p.yaxis.axis_label = "Occurences"
97123
p.grid.grid_line_color = "white"
98124
figures.append(p)
99125

100126
p = gridplot(figures, ncols=4)
101127
self.target = show(p, notebook_handle=True)
102128

103129
def update_data(self, current_timestamp):
104-
105130
# get all events from last to current timestamp
106131
events = self.metrics_reader.get_events(self.last_timestamp, current_timestamp)
107132
self.last_timestamp = current_timestamp
108133

109-
if len(events) > 0:
110-
for event in events:
111-
if event.name != None:
112-
self.system_metrics[event.name].append(event.value)
113-
114-
cpu_total = np.zeros(len(self.system_metrics["cpu0"]))
115-
116-
# iterate over available metrics
117-
for metric in self.system_metrics:
134+
self.system_metrics = self.preprocess_system_metrics(events, self.system_metrics)
118135

119-
# compute total cpu utilization
120-
if "cpu" in metric and metric != "cpu_total":
121-
cpu_total += self.system_metrics[metric]
122-
123-
self.system_metrics["cpu_total"] = cpu_total / self.cores
124-
125-
# update histograms
126-
for index, metric in enumerate(self.metrics):
127-
values = self.system_metrics[metric]
128-
129-
# create new histogram bins
130-
bins = np.arange(0, 100, 2)
131-
probs, binedges = np.histogram(values, bins=bins)
132-
bincenters = 0.5 * (binedges[1:] + binedges[:-1])
133-
134-
# update data
135-
self.sources[metric].data["top"] = probs
136-
self.sources[metric].data["left"] = binedges[:-1]
137-
self.sources[metric].data["right"] = binedges[1:]
136+
# update histograms
137+
for index, metric in enumerate(self.system_metrics):
138+
values = self.system_metrics[metric]
138139

139-
push_notebook()
140+
# create new histogram bins
141+
probs, binedges = self._get_probs_binedges(self.system_metrics[metric])
142+
# update data
143+
self.sources[metric].data["top"] = probs
144+
self.sources[metric].data["left"] = binedges[:-1]
145+
self.sources[metric].data["right"] = binedges[1:]
146+
push_notebook()

0 commit comments

Comments
 (0)