Skip to content

Commit e94a40c

Browse files
authored
Bugfixes in analysis and notebooks (#49)
1 parent 4a67447 commit e94a40c

10 files changed

+103
-47
lines changed

examples/profiler/analyze_performance_bottlenecks.ipynb

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
],
4747
"source": [
4848
"\n",
49-
"! pip install -q sdk/smdebug-0.9.2b20200807-py3-none-any.whl"
49+
"! pip install -q ../sdk/smdebug-0.9.2b20200810-py2.py3-none-any.whl"
5050
]
5151
},
5252
{
@@ -1478,7 +1478,7 @@
14781478
"source": [
14791479
"import matplotlib.pyplot as plt\n",
14801480
"\n",
1481-
"plt.pie(system_usage.values(), autopct='%1.1f%%', labels=system_usage.keys())\n",
1481+
"plt.pie(system_usage.values(), autopct='%1.1f%%', labels=system_usage.keys(), labeldistance=1.3)\n",
14821482
"plt.show()"
14831483
]
14841484
},
@@ -1971,8 +1971,11 @@
19711971
"source": [
19721972
"import pandas as pd\n",
19731973
"\n",
1974-
"dataloaders['start_time'] = pd.to_datetime(dataloaders['start_time'], format='%Y-%m-%dT%H:%M:%S:%f')\n",
1975-
"dataloaders = dataloaders.set_index(['start_time'])"
1974+
"if dataloaders.shape[0] > 0:\n",
1975+
" dataloaders['start_time'] = pd.to_datetime(dataloaders['start_time'], format='%Y-%m-%dT%H:%M:%S:%f')\n",
1976+
" dataloaders = dataloaders.set_index(['start_time'])\n",
1977+
"else:\n",
1978+
" print('No profiling information for dataloaders available')"
19761979
]
19771980
},
19781981
{
@@ -1988,7 +1991,10 @@
19881991
"metadata": {},
19891992
"outputs": [],
19901993
"source": [
1991-
"active_threads_per_second = dataloaders['tid'].groupby([pd.Grouper(freq='1S')]).count()"
1994+
"if dataloaders.shape[0] > 0:\n",
1995+
" active_threads_per_second = dataloaders['tid'].groupby([pd.Grouper(freq='1S')]).count()\n",
1996+
"else:\n",
1997+
" print('No profiling information for dataloaders available')"
19921998
]
19931999
},
19942000
{
@@ -2020,7 +2026,10 @@
20202026
}
20212027
],
20222028
"source": [
2023-
"plt.plot(active_threads_per_second[:500])"
2029+
"if dataloaders.shape[0] > 0:\n",
2030+
" plt.plot(active_threads_per_second[:500])\n",
2031+
"else:\n",
2032+
" print('No profiling information for dataloaders available')"
20242033
]
20252034
},
20262035
{
@@ -2036,7 +2045,10 @@
20362045
"metadata": {},
20372046
"outputs": [],
20382047
"source": [
2039-
"unique_active_threads_per_second = dataloaders.groupby([pd.Grouper(freq='1S'), pd.Grouper('tid'), pd.Grouper('pid')]).agg(['count'])"
2048+
"if dataloaders.shape[0] > 0:\n",
2049+
" unique_active_threads_per_second = dataloaders.groupby([pd.Grouper(freq='1S'), pd.Grouper('tid'), pd.Grouper('pid')]).agg(['count'])\n",
2050+
"else:\n",
2051+
" print('No profiling information for dataloaders available')"
20402052
]
20412053
},
20422054
{
@@ -2070,9 +2082,13 @@
20702082
}
20712083
],
20722084
"source": [
2073-
"max_threads = unique_active_threads_per_second.groupby('start_time').count().max()[0]\n",
2074-
"if max_threads < n_cpus:\n",
2075-
" print(\"Available cores:\", n_cpus, \"Maximum number of threads seen per time aggregation\", max_threads)"
2085+
"if dataloaders.shape[0] > 0:\n",
2086+
" max_threads = unique_active_threads_per_second.groupby('start_time').count().max()[0]\n",
2087+
" if max_threads < n_cpus:\n",
2088+
" print(\"Available cores:\", n_cpus, \"Maximum number of threads seen per time aggregation\", max_threads)\n",
2089+
" \n",
2090+
"else:\n",
2091+
" print('No profiling information for dataloaders available')"
20762092
]
20772093
},
20782094
{
@@ -2250,9 +2266,9 @@
22502266
}
22512267
],
22522268
"source": [
2253-
"plt.pie(framework_metrics.values(), autopct='%1.1f%%', labels=framework_metrics.keys())\n",
2269+
"plt.pie(framework_metrics.values(), autopct='%1.1f%%', labels=framework_metrics.keys(), labeldistance=1.3)\n",
22542270
"plt.show()\n",
2255-
"plt.pie(training_phase.values(), autopct='%1.1f%%', labels=training_phase.keys())\n",
2271+
"plt.pie(training_phase.values(), autopct='%1.1f%%', labels=training_phase.keys(), labeldistance=1.3)\n",
22562272
"plt.show()"
22572273
]
22582274
},
@@ -2309,9 +2325,9 @@
23092325
}
23102326
],
23112327
"source": [
2312-
"plt.pie(framework_metrics.values(), autopct='%1.1f%%', labels=framework_metrics.keys())\n",
2328+
"plt.pie(framework_metrics.values(), autopct='%1.1f%%', labels=framework_metrics.keys(), labeldistance=1.3)\n",
23132329
"plt.show()\n",
2314-
"plt.pie(training_phase.values(), autopct='%1.1f%%', labels=training_phase.keys())\n",
2330+
"plt.pie(training_phase.values(), autopct='%1.1f%%', labels=training_phase.keys(), labeldistance=1.3)\n",
23152331
"plt.show()"
23162332
]
23172333
},
@@ -2356,7 +2372,7 @@
23562372
}
23572373
],
23582374
"source": [
2359-
"plt.pie(results['outlier'].values(), autopct='%1.1f%%', labels=results['outlier'].keys())\n",
2375+
"plt.pie(results['outlier'].values(), autopct='%1.1f%%', labels=results['outlier'].keys(), labeldistance=1.3)\n",
23602376
"plt.show()"
23612377
]
23622378
},
@@ -2394,7 +2410,7 @@
23942410
}
23952411
],
23962412
"source": [
2397-
"plt.pie(results['normal'].values(), autopct='%1.1f%%', labels=results['normal'].keys())\n",
2413+
"plt.pie(results['normal'].values(), autopct='%1.1f%%', labels=results['normal'].keys(), labeldistance=1.3)\n",
23982414
"plt.show()"
23992415
]
24002416
},

examples/profiler/aws_sagemaker_profiler_example_pytorch.ipynb

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
"outputs": [],
2424
"source": [
2525
"! pip install ../sdk/sagemaker-1.60.3.dev0.tar.gz -q\n",
26-
"! pip install ../sdk/smdebug-0.9.2b20200807-py3-none-any.whl\n",
26+
"! pip install ../sdk/smdebug-0.9.2b20200810-py2.py3-none-any.whl\n",
2727
"\n",
2828
"# The following command will enable the SDK to use new profiler configs in the API\n",
2929
"! aws configure add-model --service-model file://../sdk/sagemaker-2017-07-24.normal.json --service-name sagemaker"
@@ -75,7 +75,29 @@
7575
"cell_type": "markdown",
7676
"metadata": {},
7777
"source": [
78-
"Define an Estimator"
78+
"### Set region where this notebook is running"
79+
]
80+
},
81+
{
82+
"cell_type": "code",
83+
"execution_count": null,
84+
"metadata": {},
85+
"outputs": [],
86+
"source": [
87+
"import boto3\n",
88+
"\n",
89+
"session = boto3.session.Session()\n",
90+
"region = session.region_name\n",
91+
"\n",
92+
"image_name = f'385479125792.dkr.ecr.{region}.amazonaws.com/profiler-gpu:pt_tag1'\n",
93+
"print(f\"image being used is {image_name}\")"
94+
]
95+
},
96+
{
97+
"cell_type": "markdown",
98+
"metadata": {},
99+
"source": [
100+
"### Define PyTorch estimator"
79101
]
80102
},
81103
{
@@ -89,6 +111,7 @@
89111
"\n",
90112
"estimator = PyTorch(\n",
91113
" role=sagemaker.get_execution_role(),\n",
114+
" image_name=image_name,\n",
92115
" train_instance_count=1,\n",
93116
" train_instance_type='ml.p3.8xlarge',\n",
94117
" source_dir='demo',\n",
@@ -155,7 +178,7 @@
155178
"name": "python",
156179
"nbconvert_exporter": "python",
157180
"pygments_lexer": "ipython3",
158-
"version": "3.7.3"
181+
"version": "3.7.7"
159182
}
160183
},
161184
"nbformat": 4,

examples/profiler/aws_sagemaker_profiler_example_tensorflow.ipynb

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
"outputs": [],
2424
"source": [
2525
"! pip install ../sdk/sagemaker-1.60.3.dev0.tar.gz -q\n",
26-
"! pip install ../sdk/smdebug-0.9.2b20200807-py3-none-any.whl\n",
26+
"! pip install ../sdk/smdebug-0.9.2b20200810-py2.py3-none-any.whl\n",
2727
"\n",
2828
"# The following command will enable the SDK to use new profiler configs in the API\n",
2929
"! aws configure add-model --service-model file://../sdk/sagemaker-2017-07-24.normal.json --service-name sagemaker"
@@ -76,12 +76,22 @@
7676
"metadata": {},
7777
"outputs": [],
7878
"source": [
79-
"import os\n",
80-
"region = os.environ['AWS_REGION'] # Set it to the region like us-east-1, us-east-2 if AWS_REGION is not set\n",
79+
"import boto3\n",
80+
"\n",
81+
"session = boto3.session.Session()\n",
82+
"region = session.region_name\n",
83+
"\n",
8184
"image_name = f'385479125792.dkr.ecr.{region}.amazonaws.com/profiler-gpu:latest'\n",
8285
"print(f\"image being used is {image_name}\")\n"
8386
]
8487
},
88+
{
89+
"cell_type": "markdown",
90+
"metadata": {},
91+
"source": [
92+
"### Define TensorFlow estimator"
93+
]
94+
},
8595
{
8696
"cell_type": "code",
8797
"execution_count": null,
@@ -98,7 +108,7 @@
98108
" image_name=image_name,\n",
99109
" train_instance_count=1,\n",
100110
" train_instance_type='ml.p3.8xlarge',\n",
101-
" entry_point='train.py',\n",
111+
" entry_point='train_tf.py',\n",
102112
" source_dir='demo',\n",
103113
" framework_version='2.2.0',\n",
104114
" py_version='py37',\n",
@@ -171,7 +181,7 @@
171181
" image_name=image_name,\n",
172182
" train_instance_count=1,\n",
173183
" train_instance_type='ml.p3.8xlarge',\n",
174-
" entry_point='train.py',\n",
184+
" entry_point='train_tf.py',\n",
175185
" source_dir='demo',\n",
176186
" framework_version='2.2.0',\n",
177187
" py_version='py37',\n",
@@ -241,7 +251,7 @@
241251
"name": "python",
242252
"nbconvert_exporter": "python",
243253
"pygments_lexer": "ipython3",
244-
"version": "3.7.3"
254+
"version": "3.7.7"
245255
}
246256
},
247257
"nbformat": 4,

examples/profiler/profiler_generic_dashboard.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
"! which pip\n",
4040
"! pip --version\n",
4141
"! pip uninstall smdebug --yes\n",
42-
"! pip install ./sdk/smdebug-0.9.0b20200804-py2.py3-none-any.whl\n"
42+
"! pip install ../sdk/smdebug-0.9.2b20200810-py2.py3-none-any.whl\n"
4343
]
4444
},
4545
{
Binary file not shown.
-227 KB
Binary file not shown.

smdebug/profiler/analysis/notebook_utils/timeline_charts.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -247,10 +247,10 @@ def plot_framework_events(self, events, begin_timestamp, end_timestamp):
247247
yaxis[event.event_phase],
248248
]
249249
)
250-
if index > 1000:
250+
if index > 500:
251251
print(
252-
"""Reached more than 1000 datapoints.
253-
Will only plot first 1000 datapoints for the given timerange"""
252+
"""Reached more than 500 datapoints.
253+
Will only plot first 500 datapoints for the given timerange"""
254254
)
255255
break
256256
return framework_events
@@ -272,8 +272,8 @@ def plot_dataloaders(self, events, begin_timestamp, end_timestamp):
272272
dataloaders[event.event_name].append(
273273
[int(event.start_time / 1000.0), int(event.end_time / 1000.0), tids[event.tid]]
274274
)
275-
if index > 1000:
276-
print("Reached more than 1000 datapoints. Will stop plotting.")
275+
if index > 500:
276+
print("Reached more than 500 datapoints. Will stop plotting.")
277277
break
278278

279279
return dataloaders

smdebug/profiler/analysis/utils/pandas_data_analysis.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,13 @@ def get_job_statistics(self):
5353
["Step:ModeKeys.TRAIN", "Step:ModeKeys.GLOBAL"]
5454
)
5555
)
56-
]
57-
job_statistics["training_loop_start"] = step_0["start_time"][step_0.index[0]]
56+
].reset_index(drop=True)
57+
job_statistics["training_loop_start"] = step_0["start_time"][0]
5858
job_statistics["training_loop_end"] = max(self.framework_metrics_df["end_time"])
5959
job_statistics["training_loop_duration"] = (
6060
max(self.framework_metrics_df["end_time_us"]) - step_0["start_time_us"]
6161
) / 1000
62-
job_statistics["initialization"] = step_0["start_time_us"][step_0.index[0]] / 1000
62+
job_statistics["initialization"] = step_0["start_time_us"][0] / 1000
6363
job_statistics["finalization"] = (
6464
max(self.sys_metrics_df["timestamp_us"]) - max(self.framework_metrics_df["end_time_us"])
6565
) / 1000
@@ -268,7 +268,9 @@ def get_training_phase_intervals(self, phase=None):
268268
)
269269
).reset_index(drop=True)
270270
else:
271-
mode_df = mode_df[["start_time_us", "end_time_us", "framework_metric"]]
271+
mode_df = mode_df[["start_time_us", "end_time_us", "framework_metric"]].reset_index(
272+
drop=True
273+
)
272274
mode_df.rename({"framework_metric": "phase"}, axis="columns", inplace=True)
273275

274276
for i in range(len(mode_df.index) - 1):
@@ -283,11 +285,11 @@ def get_training_phase_intervals(self, phase=None):
283285
"phase": "Between " + " and ".join(sorted([this_phase, next_phase])),
284286
}
285287
mode_df.loc[next_index] = row
286-
# need to revisit this. For PT jobs, index[0] is not 0
288+
287289
row = {
288290
"start_time_us": self.sys_metrics_df["timestamp_us"].min(),
289-
"end_time_us": mode_df["start_time_us"][mode_df.index[0]] - 1,
290-
"phase": "Before " + mode_df["phase"][mode_df.index[0]],
291+
"end_time_us": mode_df["start_time_us"][0] - 1,
292+
"phase": "Before " + mode_df["phase"][0],
291293
}
292294
mode_df.loc[-1] = row
293295
mode_df = mode_df.sort_index().reset_index(drop=True)

smdebug/profiler/analysis/utils/profiler_data_to_pandas.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717

1818
class PandasFrame:
19-
def __init__(self, path, use_in_memory_cache=False, scan_interval=50000000):
19+
def __init__(self, path, use_in_memory_cache=False, scan_interval=5000000000):
2020

2121
self.path = path
2222
self.step_time_mapping = dict()
@@ -288,6 +288,7 @@ def get_profiler_data_by_time(
288288
int(event.timestamp * CONVERT_TO_MICROSECS),
289289
event.value,
290290
event.name,
291+
event.dimension,
291292
]
292293
)
293294

@@ -297,7 +298,8 @@ def get_profiler_data_by_time(
297298

298299
# create data frame for system metrics
299300
system_metrics_df = pd.DataFrame(
300-
system_metrics, columns=["timestamp", "timestamp_us", "value", "system_metric"]
301+
system_metrics,
302+
columns=["timestamp", "timestamp_us", "value", "system_metric", "dimension"],
301303
)
302304

303305
system_metrics_df["timestamp_us"] = system_metrics_df["timestamp_us"] - self.start_time
@@ -384,7 +386,10 @@ def get_profiler_data_by_time(
384386
)
385387
framework_metrics_df["end_time_us"] = framework_metrics_df["end_time_us"] - self.start_time
386388

387-
return system_metrics_df, framework_metrics_df
389+
return (
390+
system_metrics_df[system_metrics_df.duplicated() == False],
391+
framework_metrics_df[framework_metrics_df.duplicated() == False],
392+
)
388393

389394
def get_profiler_data_by_step(self, start_step, end_step, cache_metrics=False):
390395
"""

0 commit comments

Comments
 (0)