3 changes: 3 additions & 0 deletions .gitmodules
@@ -9,3 +9,6 @@
path = 3rdparty/cxxopts
url = https://github.com/jarro2783/cxxopts
branch = v3.1.1
[submodule "3rdparty/NVTX"]
path = 3rdparty/NVTX
url = https://github.com/NVIDIA/NVTX.git
1 change: 1 addition & 0 deletions 3rdparty/NVTX
Submodule NVTX added at a1ceb0
File renamed without changes.
4 changes: 2 additions & 2 deletions cpp/benchmarks/README.md → benchmarks/cpp/README.md
@@ -7,7 +7,7 @@ multiple GPUs or multiple nodes with multiple GPUs.

### 1. Build TensorRT-LLM and benchmarking source code

Please follow the [`installation document`](../../README.md) to build TensorRT-LLM.
Please follow the [`installation document`](../../../README.md) to build TensorRT-LLM.

After that, you can build benchmarking source code for C++ runtime
```
@@ -19,7 +19,7 @@ make -j benchmarks

Before you launch C++ benchmarking, please make sure that you have already built engine(s) using TensorRT-LLM API, C++ benchmarking code cannot generate engine(s) for you.

You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../../benchmarks/README.md).
You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../python/README.md).

For detailed usage, you can do the following
```
@@ -15,13 +15,13 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/tllmRuntime.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <NvInfer.h>
#include <NvInferPlugin.h>
#include <chrono>
#include <cxxopts.hpp>
#include <filesystem>
@@ -228,7 +228,7 @@ int main(int argc, char* argv[])
{
throw std::invalid_argument("Unexpected log level: " + logLevel);
}
initLibNvInferPlugins(logger.get(), "tensorrt_llm");
initTrtLlmPlugins(logger.get());

benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens, logger,
result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>());
@@ -15,12 +15,12 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/gptSession.h"
#include "tensorrt_llm/runtime/tllmLogger.h"

#include <NvInfer.h>
#include <NvInferPlugin.h>
#include <chrono>
#include <cxxopts.hpp>
#include <iostream>
@@ -41,7 +41,10 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
auto const json = GptJsonConfig::parse(dataPath / "config.json");
auto const modelConfig = json.getModelConfig();
auto const inputPacked = modelConfig.usePackedInput();
auto const worldConfig = WorldConfig::mpi(*logger);
SizeType deviceCount{0};
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
auto const worldConfig
= WorldConfig::mpi(*logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism());
auto const enginePath = dataPath / json.engineFilename(worldConfig, modelName);
auto const dtype = modelConfig.getDataType();
auto const useHalf = (dtype == nvinfer1::DataType::kHALF);
@@ -233,7 +236,7 @@ int main(int argc, char* argv[])
// Argument: Enable CUDA graph
auto enableCudaGraph = result.count("enable_cuda_graph") > 0;

initLibNvInferPlugins(logger.get(), "tensorrt_llm");
initTrtLlmPlugins(logger.get());

benchmarkGptSession(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inOutLen,
logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>(),
10 changes: 5 additions & 5 deletions benchmarks/README.md → benchmarks/python/README.md
@@ -5,12 +5,12 @@ multiple GPUs or multiple nodes with multiple GPUs.

## Overview

The benchmark implementation and entrypoint can be found in [`benchmarks/benchmark.py`](./benchmark.py). There are some other scripts in the directory:
The benchmark implementation and entrypoint can be found in [`benchmarks/python/benchmark.py`](./benchmark.py). There are some other scripts in the directory:

* [`benchmarks/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model.
* [`benchmarks/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark.
* [`benchmarks/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models.
* [`benchmarks/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models.
* [`benchmarks/python/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model.
* [`benchmarks/python/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark.
* [`benchmarks/python/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models.
* [`benchmarks/python/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models.

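The scripts listed above share the model definitions in `allowed_configs.py`. A minimal sketch of inspecting one of those definitions programmatically, assuming `get_build_config` is exported by `allowed_configs.py` (it is the helper `gpt_benchmark.py` calls) and that the snippet is run from `benchmarks/python/`:

```python
# Sketch: print the build configuration of one supported model.
# Assumes allowed_configs.py is on the import path (run from benchmarks/python/).
from allowed_configs import get_build_config

# "falcon_40b" is one of the model names defined in allowed_configs.py.
for key, value in get_build_config("falcon_40b").items():
    print(f"{key:>28}: {value}")
```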
## Usage

@@ -14,12 +14,12 @@
# limitations under the License.
from typing import Literal, Optional

from pydantic import BaseModel
from pydantic import BaseModel, Extra

from tensorrt_llm.functional import PositionEmbeddingType


class BuildConfig(BaseModel):
class BuildConfig(BaseModel, extra=Extra.allow):
num_layers: int
num_heads: int
hidden_size: int
@@ -28,10 +28,10 @@ class BuildConfig(BaseModel):
n_positions: int
max_batch_size: int
max_input_len: int
num_kv_heads: int = None
num_kv_heads: Optional[int] = None
max_output_len: Optional[int] = None
builder_opt: Optional[
int] = None # TRT builder_optimization_level from 0 to 5
# TRT builder_optimization_level from 0 to 5
builder_opt: Optional[int] = None
inter_size: Optional[int] = None
rotary_dim: Optional[int] = None
type_vocab_size: Optional[int] = None
@@ -44,11 +44,10 @@ class BuildConfig(BaseModel):
enable_context_fmha: bool = True
# None means using the model family's default value defined in the ctor
position_embedding_type: Optional[PositionEmbeddingType] = None
# Only when position embedding is RoPE, this value makes sense, make default value to be None, not 0 or 1
# to prevent misuse
# Only when position embedding is RoPE, this value makes sense, make
# default value to be None, not 0 or 1 to prevent misuse
rotary_pct: Optional[float] = None
bias: bool = True
remove_input_padding: bool = True


class ModelConfig(BaseModel):
@@ -439,6 +438,89 @@ class ModelConfig(BaseModel):
enable_qk_half_accum=False,
enable_context_fmha=False,
)),
"falcon_rw_1b":
ModelConfig(name="falcon_rw_1b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=24,
num_heads=32,
hidden_size=2048,
vocab_size=50304,
hidden_act=None,
n_positions=2048,
max_batch_size=256,
max_input_len=1024,
max_output_len=1024,
builder_opt=None,
bias=True,
use_alibi=True,
parallel_attention=False,
new_decoder_architecture=False,
)),
"falcon_7b":
ModelConfig(name="falcon_7b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=32,
num_heads=71,
num_kv_heads=1,
hidden_size=4544,
vocab_size=65024,
hidden_act=None,
n_positions=2048,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
bias=False,
use_alibi=False,
parallel_attention=True,
new_decoder_architecture=False,
)),
"falcon_40b":
ModelConfig(name="falcon_40b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=60,
num_heads=128,
num_kv_heads=8,
hidden_size=8192,
vocab_size=65024,
hidden_act=None,
n_positions=2048,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
bias=False,
use_alibi=False,
parallel_attention=True,
new_decoder_architecture=False,
)),
"falcon_180b":
ModelConfig(name="falcon_180b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=80,
num_heads=232,
num_kv_heads=8,
hidden_size=14848,
vocab_size=65024,
hidden_act=None,
n_positions=2048,
max_batch_size=8,
max_input_len=1024,
max_output_len=1024,
builder_opt=None,
bias=False,
use_alibi=False,
parallel_attention=True,
new_decoder_architecture=False,
)),
}


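Switching the base class to `BaseModel, extra=Extra.allow` is what lets family-specific knobs such as `use_alibi`, `parallel_attention`, and `new_decoder_architecture` ride along on `BuildConfig` without being declared as fields. A trimmed-down, self-contained sketch of the pattern (not the real class):

```python
from typing import Optional

from pydantic import BaseModel, Extra


class BuildConfig(BaseModel, extra=Extra.allow):
    # Sketch only: two declared fields instead of the full list above.
    num_layers: int
    num_kv_heads: Optional[int] = None


# With Extra.allow, undeclared keyword arguments are accepted and kept as
# attributes instead of raising a validation error.
cfg = BuildConfig(num_layers=60, num_kv_heads=8,
                  use_alibi=False, parallel_attention=True)
print(cfg.num_kv_heads)        # 8
print(cfg.parallel_attention)  # True, even though it is not a declared field
```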
File renamed without changes.
File renamed without changes.
File renamed without changes.
46 changes: 32 additions & 14 deletions benchmarks/gpt_benchmark.py → benchmarks/python/gpt_benchmark.py
@@ -81,22 +81,24 @@ def __init__(self,
self.per_token = False
self.per_channel = False

self.use_gpt_attention_plugin = False
self.use_gemm_plugin = False
self.use_layernorm_plugin = False
self.use_rmsnorm_plugin = False
self.use_lookup_plugin = False
is_plugin_mode = mode == 'plugin'
plg_dtype = dtype if is_plugin_mode else False
self.use_gpt_attention_plugin = plg_dtype
self.use_gemm_plugin = plg_dtype
self.use_layernorm_plugin = plg_dtype
# Enable RMS Norm plugin for the LLaMA family.
if is_plugin_mode and 'llama' in model_name:
self.use_rmsnorm_plugin = dtype
else:
self.use_rmsnorm_plugin = False
self.use_lookup_plugin = plg_dtype
self.enable_context_fmha = True
self.quant_mode = QuantMode(0)
if mode == 'plugin':
self.use_gpt_attention_plugin = dtype
self.use_gemm_plugin = dtype
self.use_layernorm_plugin = dtype
self.use_lookup_plugin = dtype
if "llama" in model_name:
self.use_rmsnorm_plugin = dtype
self.remove_input_padding = is_plugin_mode

for key, value in get_build_config(model_name).items():
setattr(self, key, value)

# Override the n_position/max_input_len/max_output_len/max_batch_size to value from cmd line if that's specified.
if n_positions is not None:
assert isinstance(
@@ -122,6 +124,7 @@ def __init__(self,
self.num_kv_heads = self.num_heads
if kwargs.get('force_num_layer_1', False):
self.num_layers = 1

if self.use_smooth_quant:
self.quant_mode = QuantMode.use_smooth_quant(
self.per_token, self.per_channel)
@@ -195,7 +198,7 @@ def prepare_inputs(self, config):
input_lengths = torch.tensor([inlen
for _ in range(batch_size)]).int().cuda()

self.decoder.setup(batch_size, inlen, outlen)
self.decoder.setup(batch_size, inlen, outlen, beam_width=self.num_beams)
return (input_ids, input_lengths)

def build(self):
@@ -334,6 +337,21 @@ def build(self):
world_size=self.world_size,
tp_size=self.world_size), # TP only
use_parallel_embedding=(self.model_name == 'bloom_176b'))
elif family == "falcon":
tensorrt_llm_model = tensorrt_llm.models.FalconForCausalLM(
num_layers=self.num_layers,
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
hidden_size=self.hidden_size,
vocab_size=self.vocab_size,
max_position_embeddings=self.n_positions,
dtype=kv_dtype,
bias=self.bias,
use_alibi=self.use_alibi,
new_decoder_architecture=self.new_decoder_architecture,
parallel_attention=self.parallel_attention,
mapping=tensorrt_llm.Mapping(world_size=self.world_size,
tp_size=self.world_size))
else:
raise Exception(f'Unexpected model: {self.model_name}')

@@ -429,7 +447,7 @@ def build(self):

def run(self, inputs, config):
batch_size, inlen, outlen = config[0], config[1], config[2]
self.decoder.setup(batch_size, inlen, outlen)
self.decoder.setup(batch_size, inlen, outlen, beam_width=self.num_beams)
if self.remove_input_padding:
self.decoder.decode_batch(inputs[0], self.sampling_config)
else:
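The constructor refactor above collapses the per-plugin `if mode == 'plugin'` assignments into a single `plg_dtype` value: each `use_*_plugin` field carries the dtype string when plugins are enabled and `False` otherwise, the RMS norm plugin stays LLaMA-only, and `remove_input_padding` follows the same switch. A standalone sketch of that logic (hypothetical helper, not part of the benchmark class):

```python
def plugin_flags(mode: str, dtype: str, model_name: str) -> dict:
    """Mirror of the plugin-selection logic in GPTBenchmark.__init__ (sketch only)."""
    is_plugin_mode = mode == 'plugin'
    plg_dtype = dtype if is_plugin_mode else False
    return {
        'use_gpt_attention_plugin': plg_dtype,
        'use_gemm_plugin': plg_dtype,
        'use_layernorm_plugin': plg_dtype,
        # The RMS norm plugin is only enabled for the LLaMA family.
        'use_rmsnorm_plugin': dtype if is_plugin_mode and 'llama' in model_name else False,
        'use_lookup_plugin': plg_dtype,
        'remove_input_padding': is_plugin_mode,
    }


print(plugin_flags('plugin', 'float16', 'llama_7b'))   # every flag becomes 'float16'
print(plugin_flags('ootb', 'float16', 'falcon_40b'))   # any non-'plugin' mode: plugins disabled
```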
File renamed without changes.