3 changes: 3 additions & 0 deletions .gitmodules
@@ -9,3 +9,6 @@
path = 3rdparty/cxxopts
url = https://github.com/jarro2783/cxxopts
branch = v3.1.1
[submodule "3rdparty/NVTX"]
path = 3rdparty/NVTX
url = https://github.com/NVIDIA/NVTX.git
1 change: 1 addition & 0 deletions 3rdparty/NVTX
Submodule NVTX added at a1ceb0
File renamed without changes.
4 changes: 2 additions & 2 deletions cpp/benchmarks/README.md → benchmarks/cpp/README.md
@@ -7,7 +7,7 @@ multiple GPUs or multiple nodes with multiple GPUs.

### 1. Build TensorRT-LLM and benchmarking source code

Please follow the [`installation document`](../../README.md) to build TensorRT-LLM.
Please follow the [`installation document`](../../../README.md) to build TensorRT-LLM.

After that, you can build benchmarking source code for C++ runtime
```
@@ -19,7 +19,7 @@ make -j benchmarks

Before you launch C++ benchmarking, please make sure that you have already built engine(s) using TensorRT-LLM API, C++ benchmarking code cannot generate engine(s) for you.

You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../../benchmarks/README.md).
You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../python/README.md).

For detailed usage, you can do the following
```
@@ -15,13 +15,13 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/tllmRuntime.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <NvInfer.h>
#include <NvInferPlugin.h>
#include <chrono>
#include <cxxopts.hpp>
#include <filesystem>
@@ -228,7 +228,7 @@ int main(int argc, char* argv[])
{
throw std::invalid_argument("Unexpected log level: " + logLevel);
}
initLibNvInferPlugins(logger.get(), "tensorrt_llm");
initTrtLlmPlugins(logger.get());

benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens, logger,
result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>());
@@ -15,12 +15,12 @@
* limitations under the License.
*/
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/plugins/api/tllmPlugin.h"
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/gptSession.h"
#include "tensorrt_llm/runtime/tllmLogger.h"

#include <NvInfer.h>
#include <NvInferPlugin.h>
#include <chrono>
#include <cxxopts.hpp>
#include <iostream>
@@ -41,7 +41,10 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
auto const json = GptJsonConfig::parse(dataPath / "config.json");
auto const modelConfig = json.getModelConfig();
auto const inputPacked = modelConfig.usePackedInput();
auto const worldConfig = WorldConfig::mpi(*logger);
SizeType deviceCount{0};
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
auto const worldConfig
= WorldConfig::mpi(*logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism());
auto const enginePath = dataPath / json.engineFilename(worldConfig, modelName);
auto const dtype = modelConfig.getDataType();
auto const useHalf = (dtype == nvinfer1::DataType::kHALF);
@@ -233,7 +236,7 @@ int main(int argc, char* argv[])
// Argument: Enable CUDA graph
auto enableCudaGraph = result.count("enable_cuda_graph") > 0;

initLibNvInferPlugins(logger.get(), "tensorrt_llm");
initTrtLlmPlugins(logger.get());

benchmarkGptSession(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inOutLen,
logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>(),
10 changes: 5 additions & 5 deletions benchmarks/README.md → benchmarks/python/README.md
@@ -5,12 +5,12 @@ multiple GPUs or multiple nodes with multiple GPUs.

## Overview

The benchmark implementation and entrypoint can be found in [`benchmarks/benchmark.py`](./benchmark.py). There are some other scripts in the directory:
The benchmark implementation and entrypoint can be found in [`benchmarks/python/benchmark.py`](./benchmark.py). There are some other scripts in the directory:

* [`benchmarks/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model.
* [`benchmarks/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark.
* [`benchmarks/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models.
* [`benchmarks/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models.
* [`benchmarks/python/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model.
* [`benchmarks/python/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark.
* [`benchmarks/python/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models.
* [`benchmarks/python/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models.

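The scripts listed above share the model definitions in `allowed_configs.py`. A minimal sketch of inspecting one of those definitions programmatically, assuming `get_build_config` is exported by `allowed_configs.py` (it is the helper `gpt_benchmark.py` calls) and that the snippet is run from `benchmarks/python/`:

```python
# Sketch: print the build configuration of one supported model.
# Assumes allowed_configs.py is on the import path (run from benchmarks/python/).
from allowed_configs import get_build_config

# "falcon_40b" is one of the model names defined in allowed_configs.py.
for key, value in get_build_config("falcon_40b").items():
    print(f"{key:>28}: {value}")
```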
## Usage

@@ -14,12 +14,12 @@
# limitations under the License.
from typing import Literal, Optional

from pydantic import BaseModel
from pydantic import BaseModel, Extra

from tensorrt_llm.functional import PositionEmbeddingType


class BuildConfig(BaseModel):
class BuildConfig(BaseModel, extra=Extra.allow):
num_layers: int
num_heads: int
hidden_size: int
@@ -28,10 +28,10 @@ class BuildConfig(BaseModel):
n_positions: int
max_batch_size: int
max_input_len: int
num_kv_heads: int = None
num_kv_heads: Optional[int] = None
max_output_len: Optional[int] = None
builder_opt: Optional[
int] = None # TRT builder_optimization_level from 0 to 5
# TRT builder_optimization_level from 0 to 5
builder_opt: Optional[int] = None
inter_size: Optional[int] = None
rotary_dim: Optional[int] = None
type_vocab_size: Optional[int] = None
@@ -44,11 +44,10 @@ class BuildConfig(BaseModel):
enable_context_fmha: bool = True
# None means using the model family's default value defined in the ctor
position_embedding_type: Optional[PositionEmbeddingType] = None
# Only when position embedding is RoPE, this value makes sense, make default value to be None, not 0 or 1
# to prevent misuse
# Only when position embedding is RoPE, this value makes sense, make
# default value to be None, not 0 or 1 to prevent misuse
rotary_pct: Optional[float] = None
bias: bool = True
remove_input_padding: bool = True


class ModelConfig(BaseModel):
@@ -439,6 +438,89 @@ class ModelConfig(BaseModel):
enable_qk_half_accum=False,
enable_context_fmha=False,
)),
"falcon_rw_1b":
ModelConfig(name="falcon_rw_1b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=24,
num_heads=32,
hidden_size=2048,
vocab_size=50304,
hidden_act=None,
n_positions=2048,
max_batch_size=256,
max_input_len=1024,
max_output_len=1024,
builder_opt=None,
bias=True,
use_alibi=True,
parallel_attention=False,
new_decoder_architecture=False,
)),
"falcon_7b":
ModelConfig(name="falcon_7b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=32,
num_heads=71,
num_kv_heads=1,
hidden_size=4544,
vocab_size=65024,
hidden_act=None,
n_positions=2048,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
bias=False,
use_alibi=False,
parallel_attention=True,
new_decoder_architecture=False,
)),
"falcon_40b":
ModelConfig(name="falcon_40b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=60,
num_heads=128,
num_kv_heads=8,
hidden_size=8192,
vocab_size=65024,
hidden_act=None,
n_positions=2048,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
bias=False,
use_alibi=False,
parallel_attention=True,
new_decoder_architecture=False,
)),
"falcon_180b":
ModelConfig(name="falcon_180b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=80,
num_heads=232,
num_kv_heads=8,
hidden_size=14848,
vocab_size=65024,
hidden_act=None,
n_positions=2048,
max_batch_size=8,
max_input_len=1024,
max_output_len=1024,
builder_opt=None,
bias=False,
use_alibi=False,
parallel_attention=True,
new_decoder_architecture=False,
)),
}


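Switching the base class to `BaseModel, extra=Extra.allow` is what lets family-specific knobs such as `use_alibi`, `parallel_attention`, and `new_decoder_architecture` ride along on `BuildConfig` without being declared as fields. A trimmed-down, self-contained sketch of the pattern (not the real class):

```python
from typing import Optional

from pydantic import BaseModel, Extra


class BuildConfig(BaseModel, extra=Extra.allow):
    # Sketch only: two declared fields instead of the full list above.
    num_layers: int
    num_kv_heads: Optional[int] = None


# With Extra.allow, undeclared keyword arguments are accepted and kept as
# attributes instead of raising a validation error.
cfg = BuildConfig(num_layers=60, num_kv_heads=8,
                  use_alibi=False, parallel_attention=True)
print(cfg.num_kv_heads)        # 8
print(cfg.parallel_attention)  # True, even though it is not a declared field
```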
File renamed without changes.
File renamed without changes.
File renamed without changes.
46 changes: 32 additions & 14 deletions benchmarks/gpt_benchmark.py → benchmarks/python/gpt_benchmark.py
@@ -81,22 +81,24 @@ def __init__(self,
self.per_token = False
self.per_channel = False

self.use_gpt_attention_plugin = False
self.use_gemm_plugin = False
self.use_layernorm_plugin = False
self.use_rmsnorm_plugin = False
self.use_lookup_plugin = False
is_plugin_mode = mode == 'plugin'
plg_dtype = dtype if is_plugin_mode else False
self.use_gpt_attention_plugin = plg_dtype
self.use_gemm_plugin = plg_dtype
self.use_layernorm_plugin = plg_dtype
# Enable RMS Norm plugin for the LLaMA family.
if is_plugin_mode and 'llama' in model_name:
self.use_rmsnorm_plugin = dtype
else:
self.use_rmsnorm_plugin = False
self.use_lookup_plugin = plg_dtype
self.enable_context_fmha = True
self.quant_mode = QuantMode(0)
if mode == 'plugin':
self.use_gpt_attention_plugin = dtype
self.use_gemm_plugin = dtype
self.use_layernorm_plugin = dtype
self.use_lookup_plugin = dtype
if "llama" in model_name:
self.use_rmsnorm_plugin = dtype
self.remove_input_padding = is_plugin_mode

for key, value in get_build_config(model_name).items():
setattr(self, key, value)

# Override the n_position/max_input_len/max_output_len/max_batch_size to value from cmd line if that's specified.
if n_positions is not None:
assert isinstance(
@@ -122,6 +124,7 @@ def __init__(self,
self.num_kv_heads = self.num_heads
if kwargs.get('force_num_layer_1', False):
self.num_layers = 1

if self.use_smooth_quant:
self.quant_mode = QuantMode.use_smooth_quant(
self.per_token, self.per_channel)
@@ -195,7 +198,7 @@ def prepare_inputs(self, config):
input_lengths = torch.tensor([inlen
for _ in range(batch_size)]).int().cuda()

self.decoder.setup(batch_size, inlen, outlen)
self.decoder.setup(batch_size, inlen, outlen, beam_width=self.num_beams)
return (input_ids, input_lengths)

def build(self):
@@ -334,6 +337,21 @@ def build(self):
world_size=self.world_size,
tp_size=self.world_size), # TP only
use_parallel_embedding=(self.model_name == 'bloom_176b'))
elif family == "falcon":
tensorrt_llm_model = tensorrt_llm.models.FalconForCausalLM(
num_layers=self.num_layers,
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
hidden_size=self.hidden_size,
vocab_size=self.vocab_size,
max_position_embeddings=self.n_positions,
dtype=kv_dtype,
bias=self.bias,
use_alibi=self.use_alibi,
new_decoder_architecture=self.new_decoder_architecture,
parallel_attention=self.parallel_attention,
mapping=tensorrt_llm.Mapping(world_size=self.world_size,
tp_size=self.world_size))
else:
raise Exception(f'Unexpected model: {self.model_name}')

@@ -429,7 +447,7 @@ def build(self):

def run(self, inputs, config):
batch_size, inlen, outlen = config[0], config[1], config[2]
self.decoder.setup(batch_size, inlen, outlen)
self.decoder.setup(batch_size, inlen, outlen, beam_width=self.num_beams)
if self.remove_input_padding:
self.decoder.decode_batch(inputs[0], self.sampling_config)
else:
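The constructor refactor above collapses the per-plugin `if mode == 'plugin'` assignments into a single `plg_dtype` value: each `use_*_plugin` field carries the dtype string when plugins are enabled and `False` otherwise, the RMS norm plugin stays LLaMA-only, and `remove_input_padding` follows the same switch. A standalone sketch of that logic (hypothetical helper, not part of the benchmark class):

```python
def plugin_flags(mode: str, dtype: str, model_name: str) -> dict:
    """Mirror of the plugin-selection logic in GPTBenchmark.__init__ (sketch only)."""
    is_plugin_mode = mode == 'plugin'
    plg_dtype = dtype if is_plugin_mode else False
    return {
        'use_gpt_attention_plugin': plg_dtype,
        'use_gemm_plugin': plg_dtype,
        'use_layernorm_plugin': plg_dtype,
        # The RMS norm plugin is only enabled for the LLaMA family.
        'use_rmsnorm_plugin': dtype if is_plugin_mode and 'llama' in model_name else False,
        'use_lookup_plugin': plg_dtype,
        'remove_input_padding': is_plugin_mode,
    }


print(plugin_flags('plugin', 'float16', 'llama_7b'))   # every flag becomes 'float16'
print(plugin_flags('ootb', 'float16', 'falcon_40b'))   # any non-'plugin' mode: plugins disabled
```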
File renamed without changes.