Commit 279e329

Merge pull request #3 from NVIDIA/kaiyu/update
Update TRT-LLM code
2 parents: 9b563ba + 6111f52 · commit 279e329

File tree

318 files changed: +15935 additions, −6292 deletions


.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -9,3 +9,6 @@
 	path = 3rdparty/cxxopts
 	url = https://github.com/jarro2783/cxxopts
 	branch = v3.1.1
+[submodule "3rdparty/NVTX"]
+	path = 3rdparty/NVTX
+	url = https://github.com/NVIDIA/NVTX.git

3rdparty/NVTX

Submodule NVTX added at a1ceb06
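The new NVTX submodule brings in NVIDIA's profiling annotation library, used to label regions of code on profiler timelines such as Nsight Systems. For reference, a minimal sketch of the standard NVTX v3 C API; this example is illustrative and not code from this commit:

```
#include <nvtx3/nvToolsExt.h>

void runIteration()
{
    // Open a named range; it shows up as a labeled span on the profiler timeline.
    nvtxRangePushA("benchmark_iteration");
    // ... work to be profiled ...
    nvtxRangePop(); // Close the most recently opened range.
}
```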
File renamed without changes.

cpp/benchmarks/README.md renamed to benchmarks/cpp/README.md

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,7 @@ multiple GPUs or multiple nodes with multiple GPUs.
 
 ### 1. Build TensorRT-LLM and benchmarking source code
 
-Please follow the [`installation document`](../../README.md) to build TensorRT-LLM.
+Please follow the [`installation document`](../../../README.md) to build TensorRT-LLM.
 
 After that, you can build benchmarking source code for C++ runtime
 ```
@@ -19,7 +19,7 @@ make -j benchmarks
 
 Before you launch C++ benchmarking, please make sure that you have already built engine(s) using TensorRT-LLM API, C++ benchmarking code cannot generate engine(s) for you.
 
-You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../../benchmarks/README.md).
+You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../python/README.md).
 
 For detailed usage, you can do the following
 ```

cpp/benchmarks/bertBenchmark.cpp renamed to benchmarks/cpp/bertBenchmark.cpp

Lines changed: 2 additions & 2 deletions
@@ -15,13 +15,13 @@
  * limitations under the License.
  */
 #include "tensorrt_llm/common/memoryUtils.h"
+#include "tensorrt_llm/plugins/api/tllmPlugin.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/tllmLogger.h"
 #include "tensorrt_llm/runtime/tllmRuntime.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 #include <NvInfer.h>
-#include <NvInferPlugin.h>
 #include <chrono>
 #include <cxxopts.hpp>
 #include <filesystem>
@@ -228,7 +228,7 @@ int main(int argc, char* argv[])
     {
         throw std::invalid_argument("Unexpected log level: " + logLevel);
    }
-    initLibNvInferPlugins(logger.get(), "tensorrt_llm");
+    initTrtLlmPlugins(logger.get());
 
     benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens, logger,
         result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>());

cpp/benchmarks/gptSessionBenchmark.cpp renamed to benchmarks/cpp/gptSessionBenchmark.cpp

Lines changed: 6 additions & 3 deletions
@@ -15,12 +15,12 @@
  * limitations under the License.
  */
 #include "tensorrt_llm/common/memoryUtils.h"
+#include "tensorrt_llm/plugins/api/tllmPlugin.h"
 #include "tensorrt_llm/runtime/gptJsonConfig.h"
 #include "tensorrt_llm/runtime/gptSession.h"
 #include "tensorrt_llm/runtime/tllmLogger.h"
 
 #include <NvInfer.h>
-#include <NvInferPlugin.h>
 #include <chrono>
 #include <cxxopts.hpp>
 #include <iostream>
@@ -41,7 +41,10 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
     auto const json = GptJsonConfig::parse(dataPath / "config.json");
     auto const modelConfig = json.getModelConfig();
     auto const inputPacked = modelConfig.usePackedInput();
-    auto const worldConfig = WorldConfig::mpi(*logger);
+    SizeType deviceCount{0};
+    TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
+    auto const worldConfig
+        = WorldConfig::mpi(*logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism());
     auto const enginePath = dataPath / json.engineFilename(worldConfig, modelName);
     auto const dtype = modelConfig.getDataType();
     auto const useHalf = (dtype == nvinfer1::DataType::kHALF);
@@ -233,7 +236,7 @@ int main(int argc, char* argv[])
     // Argument: Enable CUDA graph
     auto enableCudaGraph = result.count("enable_cuda_graph") > 0;
 
-    initLibNvInferPlugins(logger.get(), "tensorrt_llm");
+    initTrtLlmPlugins(logger.get());
 
     benchmarkGptSession(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inOutLen,
         logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>(),
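`WorldConfig::mpi` no longer derives the topology on its own: the benchmark now queries the visible GPU count and passes the tensor and pipeline parallelism recorded in the engine's `config.json`. A condensed sketch of the new call sequence; the calls inside the function appear verbatim in the hunk above, while the wrapper function and the `cudaUtils.h` include path for `TLLM_CUDA_CHECK` are assumptions:

```
#include "tensorrt_llm/common/cudaUtils.h" // assumed home of TLLM_CUDA_CHECK
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <filesystem>

using namespace tensorrt_llm::runtime;

// Build the WorldConfig the way the updated benchmark does: count the GPUs
// visible to this rank, then pass the parallelism the engine was built with.
WorldConfig makeWorldConfig(std::filesystem::path const& dataPath, nvinfer1::ILogger& logger)
{
    SizeType deviceCount{0};
    TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));

    auto const json = GptJsonConfig::parse(dataPath / "config.json");
    return WorldConfig::mpi(logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism());
}
```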

benchmarks/README.md renamed to benchmarks/python/README.md

Lines changed: 5 additions & 5 deletions
@@ -5,12 +5,12 @@ multiple GPUs or multiple nodes with multiple GPUs.
 
 ## Overview
 
-The benchmark implementation and entrypoint can be found in [`benchmarks/benchmark.py`](./benchmark.py). There are some other scripts in the directory:
+The benchmark implementation and entrypoint can be found in [`benchmarks/python/benchmark.py`](./benchmark.py). There are some other scripts in the directory:
 
-* [`benchmarks/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model.
-* [`benchmarks/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark.
-* [`benchmarks/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models.
-* [`benchmarks/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models.
+* [`benchmarks/python/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model.
+* [`benchmarks/python/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark.
+* [`benchmarks/python/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models.
+* [`benchmarks/python/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models.
 
 ## Usage
 
benchmarks/allowed_configs.py renamed to benchmarks/python/allowed_configs.py

Lines changed: 90 additions & 8 deletions
@@ -14,12 +14,12 @@
 # limitations under the License.
 from typing import Literal, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Extra
 
 from tensorrt_llm.functional import PositionEmbeddingType
 
 
-class BuildConfig(BaseModel):
+class BuildConfig(BaseModel, extra=Extra.allow):
     num_layers: int
     num_heads: int
     hidden_size: int
@@ -28,10 +28,10 @@ class BuildConfig(BaseModel):
     n_positions: int
     max_batch_size: int
     max_input_len: int
-    num_kv_heads: int = None
+    num_kv_heads: Optional[int] = None
     max_output_len: Optional[int] = None
-    builder_opt: Optional[
-        int] = None  # TRT builder_optimization_level from 0 to 5
+    # TRT builder_optimization_level from 0 to 5
+    builder_opt: Optional[int] = None
     inter_size: Optional[int] = None
     rotary_dim: Optional[int] = None
     type_vocab_size: Optional[int] = None
@@ -44,11 +44,10 @@ class BuildConfig(BaseModel):
     enable_context_fmha: bool = True
     # None means using the model family's default value defined in the ctor
     position_embedding_type: Optional[PositionEmbeddingType] = None
-    # Only when position embedding is RoPE, this value makes sense, make default value to be None, not 0 or 1
-    # to prevent misuse
+    # Only when position embedding is RoPE, this value makes sense, make
+    # default value to be None, not 0 or 1 to prevent misuse
     rotary_pct: Optional[float] = None
     bias: bool = True
-    remove_input_padding: bool = True
 
 
 class ModelConfig(BaseModel):
@@ -439,6 +438,89 @@ class ModelConfig(BaseModel):
                     enable_qk_half_accum=False,
                     enable_context_fmha=False,
                 )),
+    "falcon_rw_1b":
+    ModelConfig(name="falcon_rw_1b",
+                family="falcon",
+                benchmark_type="gpt",
+                build_config=BuildConfig(
+                    num_layers=24,
+                    num_heads=32,
+                    hidden_size=2048,
+                    vocab_size=50304,
+                    hidden_act=None,
+                    n_positions=2048,
+                    max_batch_size=256,
+                    max_input_len=1024,
+                    max_output_len=1024,
+                    builder_opt=None,
+                    bias=True,
+                    use_alibi=True,
+                    parallel_attention=False,
+                    new_decoder_architecture=False,
+                )),
+    "falcon_7b":
+    ModelConfig(name="falcon_7b",
+                family="falcon",
+                benchmark_type="gpt",
+                build_config=BuildConfig(
+                    num_layers=32,
+                    num_heads=71,
+                    num_kv_heads=1,
+                    hidden_size=4544,
+                    vocab_size=65024,
+                    hidden_act=None,
+                    n_positions=2048,
+                    max_batch_size=128,
+                    max_input_len=512,
+                    max_output_len=200,
+                    builder_opt=None,
+                    bias=False,
+                    use_alibi=False,
+                    parallel_attention=True,
+                    new_decoder_architecture=False,
+                )),
+    "falcon_40b":
+    ModelConfig(name="falcon_40b",
+                family="falcon",
+                benchmark_type="gpt",
+                build_config=BuildConfig(
+                    num_layers=60,
+                    num_heads=128,
+                    num_kv_heads=8,
+                    hidden_size=8192,
+                    vocab_size=65024,
+                    hidden_act=None,
+                    n_positions=2048,
+                    max_batch_size=64,
+                    max_input_len=512,
+                    max_output_len=200,
+                    builder_opt=None,
+                    bias=False,
+                    use_alibi=False,
+                    parallel_attention=True,
+                    new_decoder_architecture=False,
+                )),
+    "falcon_180b":
+    ModelConfig(name="falcon_180b",
+                family="falcon",
+                benchmark_type="gpt",
+                build_config=BuildConfig(
+                    num_layers=80,
+                    num_heads=232,
+                    num_kv_heads=8,
+                    hidden_size=14848,
+                    vocab_size=65024,
+                    hidden_act=None,
+                    n_positions=2048,
+                    max_batch_size=8,
+                    max_input_len=1024,
+                    max_output_len=1024,
+                    builder_opt=None,
+                    bias=False,
+                    use_alibi=False,
+                    parallel_attention=True,
+                    new_decoder_architecture=False,
+                )),
 }
 
 
File renamed without changes.
File renamed without changes.
