Commit 279e329

Merge pull request #3 from NVIDIA/kaiyu/update
Update TRT-LLM code
2 parents: 9b563ba + 6111f52 · commit 279e329

File tree

318 files changed: +15935 additions, −6292 deletions


.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -9,3 +9,6 @@
 	path = 3rdparty/cxxopts
 	url = https://github.com/jarro2783/cxxopts
 	branch = v3.1.1
+[submodule "3rdparty/NVTX"]
+	path = 3rdparty/NVTX
+	url = https://github.com/NVIDIA/NVTX.git

3rdparty/NVTX

Submodule NVTX added at a1ceb06
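The new NVTX submodule brings in NVIDIA's profiling annotation library, used to label regions of code on profiler timelines such as Nsight Systems. For reference, a minimal sketch of the standard NVTX v3 C API; this example is illustrative and not code from this commit:

```
#include <nvtx3/nvToolsExt.h>

void runIteration()
{
    // Open a named range; it shows up as a labeled span on the profiler timeline.
    nvtxRangePushA("benchmark_iteration");
    // ... work to be profiled ...
    nvtxRangePop(); // Close the most recently opened range.
}
```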
File renamed without changes.

cpp/benchmarks/README.md renamed to benchmarks/cpp/README.md

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,7 @@ multiple GPUs or multiple nodes with multiple GPUs.
 
 ### 1. Build TensorRT-LLM and benchmarking source code
 
-Please follow the [`installation document`](../../README.md) to build TensorRT-LLM.
+Please follow the [`installation document`](../../../README.md) to build TensorRT-LLM.
 
 After that, you can build benchmarking source code for C++ runtime
 ```
@@ -19,7 +19,7 @@ make -j benchmarks
 
 Before you launch C++ benchmarking, please make sure that you have already built engine(s) using TensorRT-LLM API, C++ benchmarking code cannot generate engine(s) for you.
 
-You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../../benchmarks/README.md).
+You can reuse the engine built by benchmarking code for Python Runtime, please see that [`document`](../python/README.md).
 
 For detailed usage, you can do the following
 ```

cpp/benchmarks/bertBenchmark.cpp renamed to benchmarks/cpp/bertBenchmark.cpp

Lines changed: 2 additions & 2 deletions
@@ -15,13 +15,13 @@
  * limitations under the License.
  */
 #include "tensorrt_llm/common/memoryUtils.h"
+#include "tensorrt_llm/plugins/api/tllmPlugin.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/tllmLogger.h"
 #include "tensorrt_llm/runtime/tllmRuntime.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 #include <NvInfer.h>
-#include <NvInferPlugin.h>
 #include <chrono>
 #include <cxxopts.hpp>
 #include <filesystem>
@@ -228,7 +228,7 @@ int main(int argc, char* argv[])
     {
         throw std::invalid_argument("Unexpected log level: " + logLevel);
    }
-    initLibNvInferPlugins(logger.get(), "tensorrt_llm");
+    initTrtLlmPlugins(logger.get());
 
     benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens, logger,
         result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>());

cpp/benchmarks/gptSessionBenchmark.cpp renamed to benchmarks/cpp/gptSessionBenchmark.cpp

Lines changed: 6 additions & 3 deletions
@@ -15,12 +15,12 @@
  * limitations under the License.
  */
 #include "tensorrt_llm/common/memoryUtils.h"
+#include "tensorrt_llm/plugins/api/tllmPlugin.h"
 #include "tensorrt_llm/runtime/gptJsonConfig.h"
 #include "tensorrt_llm/runtime/gptSession.h"
 #include "tensorrt_llm/runtime/tllmLogger.h"
 
 #include <NvInfer.h>
-#include <NvInferPlugin.h>
 #include <chrono>
 #include <cxxopts.hpp>
 #include <iostream>
@@ -41,7 +41,10 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con
     auto const json = GptJsonConfig::parse(dataPath / "config.json");
     auto const modelConfig = json.getModelConfig();
     auto const inputPacked = modelConfig.usePackedInput();
-    auto const worldConfig = WorldConfig::mpi(*logger);
+    SizeType deviceCount{0};
+    TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
+    auto const worldConfig
+        = WorldConfig::mpi(*logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism());
     auto const enginePath = dataPath / json.engineFilename(worldConfig, modelName);
     auto const dtype = modelConfig.getDataType();
     auto const useHalf = (dtype == nvinfer1::DataType::kHALF);
@@ -233,7 +236,7 @@ int main(int argc, char* argv[])
     // Argument: Enable CUDA graph
     auto enableCudaGraph = result.count("enable_cuda_graph") > 0;
 
-    initLibNvInferPlugins(logger.get(), "tensorrt_llm");
+    initTrtLlmPlugins(logger.get());
 
     benchmarkGptSession(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inOutLen,
         logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>(),
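`WorldConfig::mpi` no longer derives the topology on its own: the benchmark now queries the visible GPU count and passes the tensor and pipeline parallelism recorded in the engine's `config.json`. A condensed sketch of the new call sequence; the calls inside the function appear verbatim in the hunk above, while the wrapper function and the `cudaUtils.h` include path for `TLLM_CUDA_CHECK` are assumptions:

```
#include "tensorrt_llm/common/cudaUtils.h" // assumed home of TLLM_CUDA_CHECK
#include "tensorrt_llm/runtime/gptJsonConfig.h"
#include "tensorrt_llm/runtime/worldConfig.h"

#include <filesystem>

using namespace tensorrt_llm::runtime;

// Build the WorldConfig the way the updated benchmark does: count the GPUs
// visible to this rank, then pass the parallelism the engine was built with.
WorldConfig makeWorldConfig(std::filesystem::path const& dataPath, nvinfer1::ILogger& logger)
{
    SizeType deviceCount{0};
    TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));

    auto const json = GptJsonConfig::parse(dataPath / "config.json");
    return WorldConfig::mpi(logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism());
}
```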

benchmarks/README.md renamed to benchmarks/python/README.md

Lines changed: 5 additions & 5 deletions
@@ -5,12 +5,12 @@ multiple GPUs or multiple nodes with multiple GPUs.
 
 ## Overview
 
-The benchmark implementation and entrypoint can be found in [`benchmarks/benchmark.py`](./benchmark.py). There are some other scripts in the directory:
+The benchmark implementation and entrypoint can be found in [`benchmarks/python/benchmark.py`](./benchmark.py). There are some other scripts in the directory:
 
-* [`benchmarks/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model.
-* [`benchmarks/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark.
-* [`benchmarks/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models.
-* [`benchmarks/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models.
+* [`benchmarks/python/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model.
+* [`benchmarks/python/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmark.
+* [`benchmarks/python/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like(LLaMA/OPT/GPT-J/SmoothQuant-GPT) models.
+* [`benchmarks/python/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models.
 
 ## Usage
 
benchmarks/allowed_configs.py renamed to benchmarks/python/allowed_configs.py

Lines changed: 90 additions & 8 deletions
@@ -14,12 +14,12 @@
 # limitations under the License.
 from typing import Literal, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, Extra
 
 from tensorrt_llm.functional import PositionEmbeddingType
 
 
-class BuildConfig(BaseModel):
+class BuildConfig(BaseModel, extra=Extra.allow):
     num_layers: int
     num_heads: int
     hidden_size: int
@@ -28,10 +28,10 @@ class BuildConfig(BaseModel):
     n_positions: int
     max_batch_size: int
     max_input_len: int
-    num_kv_heads: int = None
+    num_kv_heads: Optional[int] = None
     max_output_len: Optional[int] = None
-    builder_opt: Optional[
-        int] = None  # TRT builder_optimization_level from 0 to 5
+    # TRT builder_optimization_level from 0 to 5
+    builder_opt: Optional[int] = None
     inter_size: Optional[int] = None
     rotary_dim: Optional[int] = None
     type_vocab_size: Optional[int] = None
@@ -44,11 +44,10 @@ class BuildConfig(BaseModel):
     enable_context_fmha: bool = True
     # None means using the model family's default value defined in the ctor
     position_embedding_type: Optional[PositionEmbeddingType] = None
-    # Only when position embedding is RoPE, this value makes sense, make default value to be None, not 0 or 1
-    # to prevent misuse
+    # Only when position embedding is RoPE, this value makes sense, make
+    # default value to be None, not 0 or 1 to prevent misuse
     rotary_pct: Optional[float] = None
     bias: bool = True
-    remove_input_padding: bool = True
 
 
 class ModelConfig(BaseModel):
@@ -439,6 +438,89 @@ class ModelConfig(BaseModel):
                     enable_qk_half_accum=False,
                     enable_context_fmha=False,
                 )),
+    "falcon_rw_1b":
+    ModelConfig(name="falcon_rw_1b",
+                family="falcon",
+                benchmark_type="gpt",
+                build_config=BuildConfig(
+                    num_layers=24,
+                    num_heads=32,
+                    hidden_size=2048,
+                    vocab_size=50304,
+                    hidden_act=None,
+                    n_positions=2048,
+                    max_batch_size=256,
+                    max_input_len=1024,
+                    max_output_len=1024,
+                    builder_opt=None,
+                    bias=True,
+                    use_alibi=True,
+                    parallel_attention=False,
+                    new_decoder_architecture=False,
+                )),
+    "falcon_7b":
+    ModelConfig(name="falcon_7b",
+                family="falcon",
+                benchmark_type="gpt",
+                build_config=BuildConfig(
+                    num_layers=32,
+                    num_heads=71,
+                    num_kv_heads=1,
+                    hidden_size=4544,
+                    vocab_size=65024,
+                    hidden_act=None,
+                    n_positions=2048,
+                    max_batch_size=128,
+                    max_input_len=512,
+                    max_output_len=200,
+                    builder_opt=None,
+                    bias=False,
+                    use_alibi=False,
+                    parallel_attention=True,
+                    new_decoder_architecture=False,
+                )),
+    "falcon_40b":
+    ModelConfig(name="falcon_40b",
+                family="falcon",
+                benchmark_type="gpt",
+                build_config=BuildConfig(
+                    num_layers=60,
+                    num_heads=128,
+                    num_kv_heads=8,
+                    hidden_size=8192,
+                    vocab_size=65024,
+                    hidden_act=None,
+                    n_positions=2048,
+                    max_batch_size=64,
+                    max_input_len=512,
+                    max_output_len=200,
+                    builder_opt=None,
+                    bias=False,
+                    use_alibi=False,
+                    parallel_attention=True,
+                    new_decoder_architecture=False,
+                )),
+    "falcon_180b":
+    ModelConfig(name="falcon_180b",
+                family="falcon",
+                benchmark_type="gpt",
+                build_config=BuildConfig(
+                    num_layers=80,
+                    num_heads=232,
+                    num_kv_heads=8,
+                    hidden_size=14848,
+                    vocab_size=65024,
+                    hidden_act=None,
+                    n_positions=2048,
+                    max_batch_size=8,
+                    max_input_len=1024,
+                    max_output_len=1024,
+                    builder_opt=None,
+                    bias=False,
+                    use_alibi=False,
+                    parallel_attention=True,
+                    new_decoder_architecture=False,
+                )),
 }
 
 
File renamed without changes.
File renamed without changes.
