|
| 1 | +#include "torch/script.h" |
| 2 | +#include "torch/torch.h" |
| 3 | +#include "ATen/Context.h" |
| 4 | +#include "c10/cuda/CUDACachingAllocator.h" |
| 5 | +#include "trtorch/trtorch.h" |
| 6 | +#include "cuda_runtime_api.h" |
| 7 | + |
| 8 | +#include "timer.h" |
| 9 | + |
// Benchmark iteration counts. Typed constants (rather than macros) so they
// participate in overload resolution and match the uint64_t loop counters.
constexpr uint64_t NUM_WARMUP_RUNS = 20;  // untimed iterations to warm caches/JIT
constexpr uint64_t NUM_RUNS = 100;        // timed iterations used for statistics
| 12 | + |
// Benchmarking code
//
// Prints the mean and standard deviation of the recorded per-iteration
// latencies (milliseconds) and of the derived throughput (frames/second).
//
// type:       label identifying the backend/configuration being reported
// runtimes:   per-iteration latencies in milliseconds (warmup runs excluded)
// batch_size: samples per forward pass; FPS = (1000 / latency_ms) * batch_size
void print_avg_std_dev(const std::string& type, const std::vector<float>& runtimes, uint64_t batch_size) {
  // Guard against division by zero (and NaN output) when nothing was measured.
  if (runtimes.empty()) {
    std::cout << "[" << type << "]: batch_size: " << batch_size << "\n No runtimes recorded" << std::endl;
    return;
  }

  // Accumulate in double (0.0) for precision, then report as float.
  float avg_runtime = std::accumulate(runtimes.begin(), runtimes.end(), 0.0) / runtimes.size();
  float fps = (1000.f / avg_runtime) * batch_size;
  std::cout << "[" << type << "]: batch_size: " << batch_size << "\n Average latency: " << avg_runtime << " ms\n Average FPS: " << fps << " fps" << std::endl;

  // Population standard deviation of the latencies (divides by N, not N-1).
  std::vector<float> rt_diff(runtimes.size());
  std::transform(runtimes.begin(), runtimes.end(), rt_diff.begin(), [avg_runtime](float x) { return x - avg_runtime; });
  float rt_sq_sum = std::inner_product(rt_diff.begin(), rt_diff.end(), rt_diff.begin(), 0.0);
  float rt_std_dev = std::sqrt(rt_sq_sum / runtimes.size());

  // Same spread computed in FPS space (deviation of each run's FPS from mean FPS).
  std::vector<float> fps_diff(runtimes.size());
  std::transform(runtimes.begin(), runtimes.end(), fps_diff.begin(), [fps, batch_size](float x) { return ((1000.f / x) * batch_size) - fps; });
  float fps_sq_sum = std::inner_product(fps_diff.begin(), fps_diff.end(), fps_diff.begin(), 0.0);
  float fps_std_dev = std::sqrt(fps_sq_sum / runtimes.size());
  std::cout << " Latency Standard Deviation: " << rt_std_dev << "\n FPS Standard Deviation: " << fps_std_dev << "\n(excluding initial warmup runs)" << std::endl;
}
| 30 | + |
| 31 | +std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector<int64_t> shape) { |
| 32 | + auto execution_timer = timers::PreciseCPUTimer(); |
| 33 | + std::vector<float> execution_runtimes; |
| 34 | + |
| 35 | + for (uint64_t i = 0; i < NUM_WARMUP_RUNS; i++) { |
| 36 | + std::vector<torch::jit::IValue> inputs_ivalues; |
| 37 | + auto in = at::rand(shape, {at::kCUDA}); |
| 38 | +#ifdef HALF |
| 39 | + in = in.to(torch::kHalf); |
| 40 | +#endif |
| 41 | + inputs_ivalues.push_back(in.clone()); |
| 42 | + |
| 43 | + cudaDeviceSynchronize(); |
| 44 | + mod.forward(inputs_ivalues); |
| 45 | + cudaDeviceSynchronize(); |
| 46 | + |
| 47 | + } |
| 48 | + |
| 49 | + for (uint64_t i = 0; i < NUM_RUNS; i++) { |
| 50 | + std::vector<torch::jit::IValue> inputs_ivalues; |
| 51 | + auto in = at::rand(shape, {at::kCUDA}); |
| 52 | +#ifdef HALF |
| 53 | + in = in.to(torch::kHalf); |
| 54 | +#endif |
| 55 | + inputs_ivalues.push_back(in.clone()); |
| 56 | + cudaDeviceSynchronize(); |
| 57 | + |
| 58 | + execution_timer.start(); |
| 59 | + mod.forward(inputs_ivalues); |
| 60 | + cudaDeviceSynchronize(); |
| 61 | + execution_timer.stop(); |
| 62 | + |
| 63 | + auto time = execution_timer.milliseconds(); |
| 64 | + execution_timer.reset(); |
| 65 | + execution_runtimes.push_back(time); |
| 66 | + |
| 67 | + c10::cuda::CUDACachingAllocator::emptyCache(); |
| 68 | + } |
| 69 | + return execution_runtimes; |
| 70 | +} |
0 commit comments