diff --git a/tools/perf/benchmark.sh b/tools/perf/benchmark.sh
index 4a7cd24487..2770db3999 100644
--- a/tools/perf/benchmark.sh
+++ b/tools/perf/benchmark.sh
@@ -7,8 +7,8 @@ python hub.py
 batch_sizes=(1 2 4 8 16 32 64 128 256)
 large_model_batch_sizes=(1 2 4 8 16 32 64)
 
-backends=("torch" "ts_trt" "dynamo" "torch_compile" "inductor")
-backends_no_torchscript=("torch" "dynamo" "torch_compile" "inductor")
+backends=("torch" "ts_trt" "dynamo" "torch_compile" "inductor" "tensorrt")
+backends_no_torchscript=("torch" "dynamo" "torch_compile" "inductor" "tensorrt")
 
 
 # Benchmark VGG16 model
diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py
index 5a91831fe9..c52fb6ba56 100644
--- a/tools/perf/perf_run.py
+++ b/tools/perf/perf_run.py
@@ -15,6 +15,7 @@
 
 # Importing supported Backends
 import torch
+import torch_tensorrt as torchtrt
 from utils import (
     BENCHMARK_MODELS,
     parse_backends,
@@ -23,8 +24,6 @@
     precision_to_dtype,
 )
 
-import torch_tensorrt as torchtrt
-
 WARMUP_ITER = 10
 
 results = []
@@ -294,29 +293,30 @@ def run_tensorrt(
     input_tensors,
     params,
     precision,
-    is_trt_engine=False,
     batch_size=1,
 ):
-    engine = None
-
-    # If the model file is a TensorRT engine then directly deserialize and run inference
-    # else convert the torch module to a TensorRT engine first and then run inference
-    if not is_trt_engine:
-        compile_settings = {
-            "inputs": input_tensors,
-            "enabled_precisions": {precision_to_dtype(precision)},
-            "truncate_long_and_double": params.get("truncate", False),
-        }
-
-        print("Converting method to TensorRT engine...")
-        with torch.no_grad(), torchtrt.logging.errors():
-            model = torchtrt.ts.convert_method_to_trt_engine(
-                model, "forward", **compile_settings
-            )
-
+    # Export an ONNX model and convert to TRT
+    torch.onnx.export(model.eval().cuda(), tuple(input_tensors), "./tmp.onnx")
+    logger = trt.Logger(trt.Logger.WARNING)
+    builder = trt.Builder(logger)
+    network = builder.create_network(
+        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+    )
+    parser = trt.OnnxParser(network, logger)
+    success = parser.parse_from_file("./tmp.onnx")
+    if not success:
+        raise ValueError("ONNX conversion failed")
+
+    config = builder.create_builder_config()
+    if precision == "fp16":
+        config.set_flag(trt.BuilderFlag.FP16)
+    start_compile = time.time_ns()
+    serialized_engine = builder.build_serialized_network(network, config)
+    end_compile = time.time_ns()
+    compile_time_s = (end_compile - start_compile) / 1e9
     # Deserialize the TensorRT engine
-    with trt.Logger() as logger, trt.Runtime(logger) as runtime:
-        engine = runtime.deserialize_cuda_engine(model)
+    with trt.Runtime(logger) as runtime:
+        engine = runtime.deserialize_cuda_engine(serialized_engine)
 
     print("Running TensorRT for precision: ", precision, " batch_size : ", batch_size)
     iters = params.get("iterations", 20)
@@ -351,7 +351,7 @@ def run_tensorrt(
         meas_time = end_time - start_time
         timings.append(meas_time)
 
-    recordStats("TensorRT", timings, precision, batch_size)
+    recordStats("TensorRT", timings, precision, batch_size, compile_time_s)
 
 
 # Deploys inference run for different backend configurations
@@ -427,11 +427,10 @@ def run(
             )
         elif backend == "tensorrt":
             run_tensorrt(
-                model,
+                model_torch,
                 input_tensors,
                 params,
                 precision,
-                is_trt_engine,
                 batch_size,
             )
         elif backend == "dynamo":
@@ -440,9 +439,6 @@ def run(
         elif backend == "torch_compile":
             run_torch_compile(model_torch, input_tensors, params, precision, batch_size)
 
-        elif backend == "torch_compile":
-            run_torch_compile(model_torch, input_tensors, params, precision, batch_size)
-
         elif backend == "inductor":
             run_inductor(model_torch, input_tensors, params, precision, batch_size)
 
diff --git a/tools/perf/requirements.txt b/tools/perf/requirements.txt
index d204d3c335..159e6f5eab 100644
--- a/tools/perf/requirements.txt
+++ b/tools/perf/requirements.txt
@@ -1,7 +1,9 @@
 numpy
 argparse
 pyyaml
+onnx
 transformers==4.33.2
 diffusers==0.21.4
 pandas==2.0.1
 timm==0.9.8
+