|
10 | 10 | "bento/extensions/theme/main.css": true |
11 | 11 | }, |
12 | 12 | "kernelspec": { |
13 | | - "display_name": "accelerators", |
| 13 | + "display_name": "dper3_pytorch (cinder)", |
14 | 14 | "language": "python", |
15 | | - "name": "bento_kernel_accelerators", |
| 15 | + "name": "bento_kernel_dper3_pytorch_cinder", |
16 | 16 | "metadata": { |
17 | | - "kernel_name": "bento_kernel_accelerators", |
18 | | - "nightly_builds": true, |
| 17 | + "kernel_name": "bento_kernel_dper3_pytorch_cinder", |
| 18 | + "nightly_builds": false, |
19 | 19 | "fbpkg_supported": true, |
20 | | - "cinder_runtime": false, |
| 20 | + "cinder_runtime": true, |
21 | 21 | "is_prebuilt": true |
22 | 22 | } |
23 | 23 | }, |
|
32 | 32 | "nbconvert_exporter": "python", |
33 | 33 | "pygments_lexer": "ipython3" |
34 | 34 | }, |
35 | | - "last_server_session_id": "c6f6ab3c-9274-41e7-8592-b1b583442e00", |
36 | | - "last_kernel_id": "fcbf3a69-76a4-4730-9b41-bcd0b24729ca", |
37 | | - "last_base_url": "https://devgpu005.ftw6.facebook.com:8093/", |
38 | | - "last_msg_id": "e28f842c-f32dde25c1b80ef7d423dfee_407", |
| 35 | + "last_server_session_id": "24a1a10c-29aa-4e2b-a11f-2b5108fc1e58", |
| 36 | + "last_kernel_id": "5f014373-151c-4ee8-8939-4daab994d202", |
| 37 | + "last_base_url": "https://devgpu005.ftw6.facebook.com:8091/", |
| 38 | + "last_msg_id": "687e81e8-4414f32c89cd026dd1ea3fd9_139", |
39 | 39 | "outputWidgetContext": {} |
40 | 40 | }, |
41 | 41 | "nbformat": 4, |
|
58 | 58 | { |
59 | 59 | "cell_type": "code", |
60 | 60 | "metadata": { |
61 | | - "originalKey": "7909785f-b9b4-41dd-82af-c144b879df39", |
| 61 | + "originalKey": "7db2accc-9fa4-4a1e-8142-d887f2947bcd", |
62 | 62 | "showInput": true, |
63 | 63 | "customInput": null, |
64 | 64 | "collapsed": false, |
65 | | - "requestMsgId": "7db2accc-9fa4-4a1e-8142-d887f2947bcd", |
| 65 | + "requestMsgId": "b5d8efce-0963-4074-bc9d-e8e1a78fd424", |
66 | 66 | "customOutput": null, |
67 | | - "executionStartTime": 1656395936225, |
68 | | - "executionStopTime": 1656395937851 |
| 67 | + "executionStartTime": 1661189891682, |
| 68 | + "executionStopTime": 1661189891856 |
69 | 69 | }, |
70 | 70 | "source": [ |
71 | 71 | "import typing as t\n", |
|
74 | 74 | "\n", |
75 | 75 | "import torch\n", |
76 | 76 | "import torchvision\n", |
77 | | - "from torch_tensorrt.fx.lower import lower_to_trt\n", |
| 77 | + "from torch_tensorrt.fx.lower import compile\n", |
78 | 78 | "from torch_tensorrt.fx.utils import LowerPrecision" |
79 | 79 | ], |
80 | | - "execution_count": 4, |
| 80 | + "execution_count": 9, |
81 | 81 | "outputs": [] |
82 | 82 | }, |
83 | 83 | { |
|
98 | 98 | { |
99 | 99 | "cell_type": "code", |
100 | 100 | "metadata": { |
101 | | - "originalKey": "a4455135-8633-4d2d-bdd3-6435a4a9f4dd", |
| 101 | + "originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726", |
102 | 102 | "showInput": true, |
103 | 103 | "customInput": null, |
104 | 104 | "code_folding": [], |
105 | 105 | "hidden_ranges": [], |
106 | 106 | "collapsed": false, |
107 | | - "requestMsgId": "2835fffa-cc50-479a-9080-c4f7002c0726", |
| 107 | + "requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5", |
108 | 108 | "customOutput": null, |
109 | | - "executionStartTime": 1656398717455, |
110 | | - "executionStopTime": 1656398717662 |
| 109 | + "executionStartTime": 1661189260550, |
| 110 | + "executionStopTime": 1661189262039 |
111 | 111 | }, |
112 | 112 | "source": [ |
113 | 113 | "@dataclass\n", |
|
159 | 159 | " f\"Accuracy: {self.accuracy_res} (rtol={self.conf.accuracy_rtol})\"\n", |
160 | 160 | " )" |
161 | 161 | ], |
162 | | - "execution_count": 22, |
163 | | - "outputs": [] |
| 162 | + "execution_count": 2, |
| 163 | + "outputs": [ |
| 164 | + { |
| 165 | + "output_type": "stream", |
| 166 | + "name": "stderr", |
| 167 | + "text": [ |
| 168 | + "I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n" |
| 169 | + ] |
| 170 | + }, |
| 171 | + { |
| 172 | + "output_type": "stream", |
| 173 | + "name": "stderr", |
| 174 | + "text": [ |
| 175 | + "I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n" |
| 176 | + ] |
| 177 | + } |
| 178 | + ] |
164 | 179 | }, |
165 | 180 | { |
166 | 181 | "cell_type": "markdown", |
167 | 182 | "metadata": { |
168 | 183 | "originalKey": "3e462cf6-d282-402d-955b-a3ecb400bf0b", |
169 | | - "showInput": true, |
| 184 | + "showInput": false, |
170 | 185 | "customInput": null, |
171 | 186 | "code_folding": [], |
172 | 187 | "hidden_ranges": [] |
173 | 188 | }, |
174 | 189 | "source": [ |
175 | 190 | "Run FX path lowering and benchmark the given model according to the specified benchmark configuration. Prints the benchmark result for each configuration at the end of the run. `benchmark_torch_function` is the actual function that runs the given function for a fixed number of iterations and estimates the average time per run.\n", |
176 | | - "The FX path lowering and TensorRT engine creation is integrated into `low_to_trt()` API which is defined in `fx/lower.py` file.\n", |
| 191 | + "The FX path lowering and TensorRT engine creation are integrated into the `compile()` API, which is defined in the `fx/lower.py` file.\n", |
177 | 192 | "It is listed here to show its usage. It takes in the original module, the input, and the lowering setting, and runs the lowering workflow to turn the module into an executable TRT engine. \n", |
178 | 193 | "```\n", |
179 | | - "def lower_to_trt(\n", |
| 194 | + "def compile(\n", |
180 | 195 | " module: nn.Module,\n", |
181 | 196 | " input: ,\n", |
182 | 197 | " max_batch_size: int = 2048,\n", |
|
212 | 227 | { |
213 | 228 | "cell_type": "code", |
214 | 229 | "metadata": { |
215 | | - "originalKey": "91333212-7f6d-4bde-a248-44d485e83e5e", |
| 230 | + "originalKey": "3002935b-b95a-4a08-a57f-f7a35485af5b", |
216 | 231 | "showInput": true, |
217 | 232 | "customInput": null, |
218 | 233 | "code_folding": [], |
219 | 234 | "hidden_ranges": [], |
220 | 235 | "collapsed": false, |
221 | | - "requestMsgId": "3002935b-b95a-4a08-a57f-f7a35485af5b", |
| 236 | + "requestMsgId": "dc73f2d0-427b-4f71-bec1-b118cc5642d0", |
222 | 237 | "customOutput": null, |
223 | | - "executionStartTime": 1656397903207, |
224 | | - "executionStopTime": 1656397964752 |
| 238 | + "executionStartTime": 1661189697773, |
| 239 | + "executionStopTime": 1661189753875 |
225 | 240 | }, |
226 | 241 | "source": [ |
227 | | - "test_model = torchvision.models.resnet18(pretrained=True)\n", |
228 | | - "input = [torch.rand(128, 3, 224, 224)] \n", |
229 | | - "benchmark(test_model, input, 50, 128)\n", |
230 | | - "\n", |
231 | 242 | "def benchmark_torch_function(iters: int, f, *args) -> float:\n", |
232 | 243 | " \"\"\"Estimates the average time duration for a single inference call in second\n", |
233 | 244 | "\n", |
|
266 | 277 | " time = benchmark_torch_function(conf.batch_iter, lambda: module(*input))\n", |
267 | 278 | " elif not conf.jit:\n", |
268 | 279 | " # Run lowering eager mode benchmark\n", |
269 | | - " lowered_module = lower_to_trt(\n", |
| 280 | + " lowered_module = compile(\n", |
270 | 281 | " module,\n", |
271 | 282 | " input,\n", |
272 | 283 | " max_batch_size=conf.batch_size,\n", |
|
279 | 290 | " result = Result(module=module, input=input, conf=conf, time_sec=time)\n", |
280 | 291 | " return result\n", |
281 | 292 | "\n", |
| 293 | + "\n", |
282 | 294 | "@torch.inference_mode()\n", |
283 | 295 | "def benchmark(\n", |
284 | 296 | " model,\n", |
|
315 | 327 | " ),\n", |
316 | 328 | " ]\n", |
317 | 329 | "\n", |
318 | | - " results = [\n", |
319 | | - " run_configuration_benchmark(deepcopy(model), inputs, conf_)\n", |
320 | | - " for conf_ in configurations\n", |
321 | | - " ]\n", |
| 330 | + " results = [run_configuration_benchmark(deepcopy(model), inputs, conf_) for conf_ in configurations]\n", |
322 | 331 | "\n", |
323 | 332 | " for res in results:\n", |
324 | | - " print(res.format())" |
| 333 | + " print(res.format())\n", |
| 334 | + "\n", |
| 335 | + "\n", |
| 336 | + "test_model = torchvision.models.resnet18(pretrained=True)\n", |
| 337 | + "input = [torch.rand(128, 3, 224, 224)]\n", |
| 338 | + "benchmark(test_model, input, 50, 128)" |
325 | 339 | ], |
326 | | - "execution_count": 21, |
| 340 | + "execution_count": 8, |
327 | 341 | "outputs": [ |
| 342 | + { |
| 343 | + "output_type": "stream", |
| 344 | + "name": "stderr", |
| 345 | + "text": [ |
| 346 | + "I0822 103458.189 manifold.py:1435] URL manifold://torchvision/tree/models/resnet18-f37072fd.pth was already cached in /home/wwei6/.torch/iopath_cache/manifold_cache/tree/models/resnet18-f37072fd.pth\n" |
| 347 | + ] |
| 348 | + }, |
328 | 349 | { |
329 | 350 | "output_type": "stream", |
330 | 351 | "name": "stdout", |
|
339 | 360 | "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001) green\n" |
340 | 361 | ] |
341 | 362 | }, |
| 363 | + { |
| 364 | + "output_type": "stream", |
| 365 | + "name": "stderr", |
| 366 | + "text": [ |
| 367 | + "I0822 103501.297 pass_utils.py:166] == Log pass <function fuse_permute_matmul at 0x7f787a0e08b0> before/after graph to /tmp/tmpe_7p37fq\n" |
| 368 | + ] |
| 369 | + }, |
| 370 | + { |
| 371 | + "output_type": "stream", |
| 372 | + "name": "stderr", |
| 373 | + "text": [ |
| 374 | + "I0822 103501.390 pass_utils.py:166] == Log pass <function fuse_permute_linear at 0x7f787a0e0670> before/after graph to /tmp/tmpg_a347f0\n" |
| 375 | + ] |
| 376 | + }, |
| 377 | + { |
| 378 | + "output_type": "stream", |
| 379 | + "name": "stderr", |
| 380 | + "text": [ |
| 381 | + "I0822 103501.509 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" |
| 382 | + ] |
| 383 | + }, |
| 384 | + { |
| 385 | + "output_type": "stream", |
| 386 | + "name": "stderr", |
| 387 | + "text": [ |
| 388 | + "I0822 103501.511 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float32, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" |
| 389 | + ] |
| 390 | + }, |
342 | 391 | { |
343 | 392 | "output_type": "stream", |
344 | 393 | "name": "stdout", |
345 | 394 | "text": [ |
346 | | - "== Log pass <function fuse_permute_matmul at 0x7fbdfcc9f1f0> before/after graph to /tmp/tmpaayayg72\n== Log pass <function fuse_permute_linear at 0x7fbe36555f70> before/after graph to /tmp/tmpdw_pq71j\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" |
| 395 | + "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" |
347 | 396 | ] |
348 | 397 | }, |
349 | 398 | { |
350 | 399 | "output_type": "stream", |
351 | 400 | "name": "stderr", |
352 | 401 | "text": [ |
353 | | - "I0627 233146.650 fx2trt.py:190] Run Module elapsed time: 0:00:00.244369\n" |
| 402 | + "I0822 103503.964 fx2trt.py:204] Run Module elapsed time: 0:00:00.435984\n" |
354 | 403 | ] |
355 | 404 | }, |
356 | 405 | { |
357 | 406 | "output_type": "stream", |
358 | 407 | "name": "stderr", |
359 | 408 | "text": [ |
360 | | - "I0627 233206.570 fx2trt.py:241] Build TRT engine elapsed time: 0:00:19.918630\n" |
| 409 | + "I0822 103520.647 fx2trt.py:258] Build TRT engine elapsed time: 0:00:16.681226\n" |
| 410 | + ] |
| 411 | + }, |
| 412 | + { |
| 413 | + "output_type": "stream", |
| 414 | + "name": "stderr", |
| 415 | + "text": [ |
| 416 | + "I0822 103520.658 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:19.147071\n" |
361 | 417 | ] |
362 | 418 | }, |
363 | 419 | { |
|
374 | 430 | "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01) green\n" |
375 | 431 | ] |
376 | 432 | }, |
| 433 | + { |
| 434 | + "output_type": "stream", |
| 435 | + "name": "stderr", |
| 436 | + "text": [ |
| 437 | + "I0822 103523.067 pass_utils.py:166] == Log pass <function fuse_permute_matmul at 0x7f787a0e08b0> before/after graph to /tmp/tmpgphlicna\n" |
| 438 | + ] |
| 439 | + }, |
| 440 | + { |
| 441 | + "output_type": "stream", |
| 442 | + "name": "stderr", |
| 443 | + "text": [ |
| 444 | + "I0822 103523.106 pass_utils.py:166] == Log pass <function fuse_permute_linear at 0x7f787a0e0670> before/after graph to /tmp/tmpy9cumddi\n" |
| 445 | + ] |
| 446 | + }, |
| 447 | + { |
| 448 | + "output_type": "stream", |
| 449 | + "name": "stderr", |
| 450 | + "text": [ |
| 451 | + "I0822 103523.173 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" |
| 452 | + ] |
| 453 | + }, |
| 454 | + { |
| 455 | + "output_type": "stream", |
| 456 | + "name": "stderr", |
| 457 | + "text": [ |
| 458 | + "I0822 103523.174 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float16, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" |
| 459 | + ] |
| 460 | + }, |
377 | 461 | { |
378 | 462 | "output_type": "stream", |
379 | 463 | "name": "stdout", |
380 | 464 | "text": [ |
381 | | - "== Log pass <function fuse_permute_matmul at 0x7fbdfcc9f1f0> before/after graph to /tmp/tmpnoeblgd5\n== Log pass <function fuse_permute_linear at 0x7fbe36555f70> before/after graph to /tmp/tmpyb1egsof\n\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" |
| 465 | + "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" |
| 466 | + ] |
| 467 | + }, |
| 468 | + { |
| 469 | + "output_type": "stream", |
| 470 | + "name": "stderr", |
| 471 | + "text": [ |
| 472 | + "I0822 103523.466 fx2trt.py:204] Run Module elapsed time: 0:00:00.288043\n" |
382 | 473 | ] |
383 | 474 | }, |
384 | 475 | { |
385 | 476 | "output_type": "stream", |
386 | 477 | "name": "stderr", |
387 | 478 | "text": [ |
388 | | - "I0627 233208.996 fx2trt.py:190] Run Module elapsed time: 0:00:00.217076\n" |
| 479 | + "I0822 103553.687 fx2trt.py:258] Build TRT engine elapsed time: 0:00:30.220316\n" |
389 | 480 | ] |
390 | 481 | }, |
391 | 482 | { |
392 | 483 | "output_type": "stream", |
393 | 484 | "name": "stderr", |
394 | 485 | "text": [ |
395 | | - "I0627 233244.147 fx2trt.py:241] Build TRT engine elapsed time: 0:00:35.150950\n" |
| 486 | + "I0822 103553.698 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:30.523791\n" |
396 | 487 | ] |
397 | 488 | }, |
398 | 489 | { |
|
406 | 497 | "output_type": "stream", |
407 | 498 | "name": "stdout", |
408 | 499 | "text": [ |
409 | | - "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 15.00ms, QPS: 8530.72, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.95ms, QPS: 16098.45, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.36ms, QPS: 29365.31, Accuracy: None (rtol=0.01)\n" |
| 500 | + "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 14.66ms, QPS: 8732.53, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.27ms, QPS: 17595.70, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.49ms, QPS: 28480.34, Accuracy: None (rtol=0.01)\n" |
410 | 501 | ] |
411 | 502 | } |
412 | 503 | ] |
|
0 commit comments