9 changes: 8 additions & 1 deletion docs/source/README.md → README.md
@@ -251,7 +251,14 @@ corresponds to your CUDA version. As an example, for CUDA 12.1, use:
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
```

[CUDA_ARCHITECTURES in CMake]: https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES
- When building models, you might encounter memory-related issues such as:
```
[09/23/2023-03:13:00] [TRT] [E] 9: GPTLMHeadModel/layers/0/attention/qkv/PLUGIN_V2_Gemm_0: could not find any supported formats consistent with input/output data types
[09/23/2023-03:13:00] [TRT] [E] 9: [pluginV2Builder.cpp::reportPluginError::24] Error Code 9: Internal Error (GPTLMHeadModel/layers/0/attention/qkv/PLUGIN_V2_Gemm_0: could not find any supported formats consistent with input/output data types)
```
You can reduce the memory pressure by lowering the maximum batch size and the input and output lengths. Another option is to enable plugins, for example `--use_gpt_attention_plugin`, as sketched below.
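The following is an illustration only: it assumes your model's `build.py` (the GPT example is used here) exposes `--max_batch_size`, `--max_input_len`, and `--max_output_len` flags, and the values are placeholders to tune for your GPU:
```
# Sketch: lower these limits further if the error above persists.
python3 examples/gpt/build.py \
    --max_batch_size 8 \
    --max_input_len 512 \
    --max_output_len 128 \
    --use_gpt_attention_plugin
```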

- [CUDA_ARCHITECTURES in CMake]: https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES

## Release notes

15 changes: 12 additions & 3 deletions benchmarks/cpp/CMakeLists.txt
@@ -25,9 +25,17 @@ add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)
function(add_benchmark test_name test_src)
  add_executable(${test_name} ${test_src})

  target_link_libraries(
    ${test_name} PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm
    cxxopts::cxxopts)
  if(NOT WIN32) # Linux
    target_link_libraries(
      ${test_name} PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm
      cxxopts::cxxopts)
  else()
    # Use STATIC_TARGET on Windows because MSVC is picky about duplicate symbols
    # if the shared and static libs both get linked
    target_link_libraries(
      ${test_name} PUBLIC ${STATIC_TARGET} nvinfer_plugin_tensorrt_llm
      cxxopts::cxxopts)
  endif()

target_compile_features(${test_name} PRIVATE cxx_std_17)
target_compile_definitions(${test_name}
@@ -37,3 +45,4 @@ endfunction()

add_benchmark(gptSessionBenchmark gptSessionBenchmark.cpp)
add_benchmark(bertBenchmark bertBenchmark.cpp)
add_benchmark(gptManagerBenchmark gptManagerBenchmark.cpp)
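With the `gptManagerBenchmark` target registered above, a quick sanity check is to build and run just that executable. This is a sketch that assumes the `cpp/build` workflow described in `benchmarks/cpp/README.md`:
```
cd cpp/build
# add_executable() gives each benchmark its own target, so it can be built on its own
make -j gptManagerBenchmark
./benchmarks/gptManagerBenchmark --help
```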
48 changes: 47 additions & 1 deletion benchmarks/cpp/README.md
@@ -15,7 +15,7 @@ cd cpp/build
make -j benchmarks
```

### 2. Launch C++ benchmarking
### 2. Launch C++ benchmarking (Fixed BatchSize/InputLen/OutputLen)

Before you launch C++ benchmarking, please make sure that you have already built the engine(s) using the TensorRT-LLM API; the C++ benchmarking code cannot generate engines for you.

@@ -55,3 +55,49 @@ mpirun -n 8 ./benchmarks/gptSessionBenchmark \
```

*Please note that the expected outputs in that document are only for reference; specific performance numbers depend on the GPU you're using.*

### 3. Launch Batch Manager benchmarking (Inflight/V1 batching)

#### Prepare dataset

Run the preprocessing script to prepare the dataset. This script converts the prompts (strings) in the dataset to `input_ids`.
```
python3 prepare_dataset.py \
--dataset <path/to/dataset> \
--max_input_len 300 \
--tokenizer_dir <path/to/tokenizer> \
--tokenizer_type auto \
--output preprocessed_dataset.json
```
For `tokenizer_dir`, you can either specify the path to a local tokenizer that has already been downloaded, or simply the name of a tokenizer from Hugging Face, such as `gpt2`; in the latter case the tokenizer is downloaded automatically.
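For instance, the same command with a Hugging Face tokenizer name instead of a local path (the dataset path is still a placeholder):
```
python3 prepare_dataset.py \
    --dataset <path/to/dataset> \
    --max_input_len 300 \
    --tokenizer_dir gpt2 \
    --tokenizer_type auto \
    --output preprocessed_dataset.json
```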

#### Prepare TensorRT-LLM engines
If you'd like to benchmark inflight batching, please make sure that the engines are built with the arguments `--use_inflight_batching` and `--remove_input_padding`; for more details, please see the documentation in the TensorRT-LLM examples.
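As a rough sketch only (the model directory, output directory, and any additional flags depend on your model and are covered in the examples documentation), a GPT engine build for inflight batching could look like:
```
# Only --use_inflight_batching and --remove_input_padding are required by this section;
# the paths below are placeholders.
python3 examples/gpt/build.py \
    --model_dir <path/to/gpt/checkpoint> \
    --output_dir trt_engine/gpt2-ib/fp16/1-gpu/ \
    --use_inflight_batching \
    --remove_input_padding
```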

#### Launch benchmarking

For detailed usage, you can do the following:
```
cd cpp/build

# You can directly execute the binary for help information
./benchmarks/gptManagerBenchmark --help
```

Take GPT-350M as an example for single-GPU V1 batching:
```
./benchmarks/gptManagerBenchmark \
--model gpt \
--engine_dir ../../examples/gpt/trt_engine/gpt2/fp16/1-gpu/ \
--type V1 \
--dataset ../../benchmarks/cpp/preprocessed_dataset.json
```

Take GPT-350M as an example for 2-GPU inflight batching:
```
mpirun -n 2 ./benchmarks/gptManagerBenchmark \
--model gpt \
--engine_dir ../../examples/gpt/trt_engine/gpt2-ib/fp16/2-gpu/ \
--type IFB \
--dataset ../../benchmarks/cpp/preprocessed_dataset.json
```
20 changes: 15 additions & 5 deletions benchmarks/cpp/bertBenchmark.cpp
@@ -78,7 +78,7 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da
{
    auto const worldConfig = WorldConfig::mpi(*logger);
    auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);
    auto engineBlob = loadEngine(enginePath);
    auto engineBlob = loadEngine(enginePath.string());

    auto rt = std::make_shared<TllmRuntime>(engineBlob.data(), engineBlob.size(), *logger);
    rt->addContext(0);
@@ -180,7 +180,8 @@ int main(int argc, char* argv[])
    if (!result.count("engine_dir"))
    {
        std::cout << options.help() << std::endl;
        throw std::invalid_argument("Please specify engine directory.");
        TLLM_LOG_ERROR("Please specify engine directory.");
        return 1;
    }

    // Argument: Batch sizes
@@ -226,11 +227,20 @@ int main(int argc, char* argv[])
    }
    else
    {
        throw std::invalid_argument("Unexpected log level: " + logLevel);
        TLLM_LOG_ERROR("Unexpected log level: " + logLevel);
        return 1;
    }
    initTrtLlmPlugins(logger.get());

    benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens, logger,
        result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>());
    try
    {
        benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens,
            logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>());
    }
    catch (const std::exception& e)
    {
        TLLM_LOG_ERROR(e.what());
        return 1;
    }
    return 0;
}