Commit 55d68d2

org perf cases and add cases in perflab to qa test list
Signed-off-by: ruodil <[email protected]>
1 parent 7ceb2f9

File tree

4 files changed: 74 additions & 12 deletions

tests/integration/defs/perf/pytorch_model_config.py

Lines changed: 8 additions & 5 deletions
@@ -56,8 +56,8 @@ def get_model_yaml_config(model_label: str,
         # DeepSeek R1 models with MTP speculative decoding
         {
             'patterns': [
-                'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-gpus:8',
-                'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8'
+                'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-reqs:10-ep:4-gpus:8',
+                'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8'
             ],
             'config': {
                 'enable_attention_dp': True,
@@ -71,8 +71,8 @@ def get_model_yaml_config(model_label: str,
         # DeepSeek R1 models with large batch sizes and cuda graph padding
         {
             'patterns': [
-                'deepseek_r1-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8',
-                'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8'
+                'deepseek_r1_fp8-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-gpus:8',
+                'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-gpus:8'
             ],
             'config': {
                 'enable_attention_dp': True,
@@ -85,7 +85,7 @@ def get_model_yaml_config(model_label: str,
         # DeepSeek R1 model with specific batch size 128
         {
             'patterns':
-            'deepseek_r1-bench-pytorch-float16-maxbs:128-maxnt:1127-input_output_len:1000,2000-quant:fp8-reqs:5120-con:1024-ep:8-gpus:8',
+            'deepseek_r1_fp8-bench-pytorch-float16-maxbs:128-maxnt:1127-input_output_len:1000,2000-reqs:5120-con:1024-ep:8-gpus:8',
             'config': {
                 'enable_attention_dp': True,
                 'cuda_graph_config': {
@@ -154,6 +154,9 @@ def get_model_yaml_config(model_label: str,
                 'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:2000,500-gpus:4',
                 'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:128,128-gpus:4',
                 'llama_v3.3_70b_instruct_fp8-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:512,32-gpus:4',
+                'llama_v3.1_405b_instruct_fp4',
+                'llama_v4_scout_17b_16e_instruct_fp4',
+                'llama_v4_maverick_17b_128e_instruct_fp8'
             ],
             'config': {
                 'use_cuda_graph':
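For context, each rule in pytorch_model_config.py pairs a 'patterns' entry (a single label or a list of labels) with a 'config' dict of extra LLM API options. A minimal sketch of how such rules could be applied to a test label, assuming substring matching and dict merging (the actual get_model_yaml_config implementation may differ):

```python
# Hypothetical sketch only; the real get_model_yaml_config may match and
# merge config rules differently.
def select_extra_config(model_label: str, rules: list) -> dict:
    merged = {}
    for rule in rules:
        patterns = rule['patterns']
        # 'patterns' can be one label or a list of labels.
        if isinstance(patterns, str):
            patterns = [patterns]
        if any(p in model_label for p in patterns):
            merged.update(rule['config'])  # later rules override earlier ones
    return merged
```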

tests/integration/defs/perf/test_perf.py

Lines changed: 10 additions & 7 deletions
@@ -55,7 +55,8 @@
     "llama_v3.3_70b_instruct_fp4":
     "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4",
     "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
-    "llama_v3.1_405b_instruct_fp8": "llama-3.1-model/Llama-3.1-405B-Instruct-FP8",
+    "llama_v3.1_405b_instruct_fp8":
+    "llama-3.1-model/Llama-3.1-405B-Instruct-FP8",
     "llama_v3.1_405b_instruct_fp4":
     "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
     "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
@@ -73,11 +74,13 @@
     "llama_v4_scout_17b_16e_instruct":
     "llama4-models/Llama-4-Scout-17B-16E-Instruct",
     "llama_v4_scout_17b_16e_instruct_fp8":
+    "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8",
+    "llama_v4_scout_17b_16e_instruct_fp4":
+    "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4",
     "llama_v4_maverick_17b_128e_instruct":
     "llama4-models/Llama-4-Maverick-17B-128E-Instruct",
     "llama_v4_maverick_17b_128e_instruct_fp8":
-    "modelopt-hf-model-hub/Llama-4-Maverick-17B-128E-Instruct-FP8",
-    # "llama_30b": "llama-models/llama-30b-hf",
+    "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
     "mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1",
     "mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1",
     "mixtral_8x7b_v0.1_instruct_fp8": "Mixtral-8x7B-Instruct-v0.1-fp8",
@@ -1223,6 +1226,8 @@ def get_trtllm_bench_command(self, engine_dir):
         model_name = self._config.model_name
         dataset_path = os.path.join(engine_dir, "synthetic_data.json")
         report_path = os.path.join(engine_dir, "report.json")
+        pytorch_config_path = os.path.join(engine_dir,
+                                           "extra-llm-api-config.yml")
         if not model_name.endswith("_hf"):
             model_name = model_name + "_hf"
         hf_model_name = HF_MODEL_PATH.get(model_name, "")
@@ -1262,11 +1267,9 @@ def get_trtllm_bench_command(self, engine_dir):
         config = get_model_yaml_config(self._config.to_string(),
                                        lora_dirs=self.lora_dirs)
         print_info(f"pytorch model config: {config}")
-        with open('extra-llm-api-config.yml', 'w') as f:
+        with open(pytorch_config_path, 'w') as f:
             yaml.dump(config, f, default_flow_style=False)
-        benchmark_cmd += [
-            f"--extra_llm_api_options=extra-llm-api-config.yml"
-        ]
+        benchmark_cmd += [f"--extra_llm_api_options={pytorch_config_path}"]
         return benchmark_cmd

     def get_gpt_manager_runtime_benchmark_command(self, engine_dir, bs,
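The test_perf.py change above writes the generated extra-llm-api-config.yml into engine_dir instead of the current working directory and passes its full path to the benchmark command. A condensed sketch of the resulting flow, with the path and flag taken from the diff (append_extra_llm_api_options is a hypothetical wrapper, not a function in the file):

```python
import os
import yaml

def append_extra_llm_api_options(benchmark_cmd, engine_dir, config):
    # Write the per-model overrides next to the other engine artifacts
    # (synthetic_data.json, report.json) rather than into the CWD.
    pytorch_config_path = os.path.join(engine_dir, "extra-llm-api-config.yml")
    with open(pytorch_config_path, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)
    # Point the benchmark at the absolute path so it no longer depends on
    # where the test process happens to run.
    benchmark_cmd += [f"--extra_llm_api_options={pytorch_config_path}"]
    return benchmark_cmd
```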

tests/integration/test_lists/qa/trt_llm_release_perf_cluster_test.yml

Lines changed: 19 additions & 0 deletions
@@ -53,14 +53,33 @@ trt_llm_release_perf_cluster_test:
   tests:
   #- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:8]
   #- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-gpus:8]
+  #llama_v3.3_nemotron_super_49b
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-bfloat16-input_output_len:500,2000-con:250-gpus:8]
+  #llama_v3.3_70b_instruct_fp4
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:500,2000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:1000,1000-tp:8-gpus:8]
+  #llama_v3.1_405b_instruct_fp4
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:500,2000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:1000,1000-tp:8-gpus:8]
+  #llama_v4_scout_17b_16e_instruct_fp4
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:500,2000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:1000,1000-tp:8-gpus:8]
+  #mixtral_8x22b_v0.1
   - perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:8]
   - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8]
   - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8]
+  #deepseek_r1_fp8
   - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test
   - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
+  #deepseek_r1_nvfp4
   - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test
   - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test
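The perf test names added to these QA lists encode their settings as '-'-separated tokens, most of them key:value fields (maxbs, maxnt, input_output_len, reqs, con, ep, tp, gpus). A rough sketch of splitting such a label into its fields, assuming this naming convention; the field meanings (e.g. maxbs as max batch size, con as concurrency) are inferred and not confirmed by this commit:

```python
def parse_perf_label(label: str) -> dict:
    # e.g. "deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8"
    fields = {"flags": []}
    for token in label.split('-'):
        if ':' in token:
            key, value = token.split(':', 1)
            fields[key] = value
        else:
            # bare tokens: model name, 'bench', backend, dtype, 'streaming', ...
            fields["flags"].append(token)
    return fields
```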

tests/integration/test_lists/qa/trt_llm_release_perf_test.yml

Lines changed: 37 additions & 0 deletions
@@ -463,6 +463,33 @@ trt_llm_release_perf_test:
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:500,2000-reqs:8-con:1-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-maxnt:5000-input_output_len:5000,500-reqs:250-con:250-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b_fp8-bench-pytorch-float8-maxbs:256-input_output_len:500,2000-reqs:250-con:250-tp:8-gpus:8]
+  # llama_v3.1_405b_fp8
+  #pytorch backend
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2000,500-reqs:8-con:1-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_fp8-bench-pytorch-float8-input_output_len:512,32-tp:8-gpus:8]
+
+  #llama_v4_maverick_17b_128e_instruct_fp8
+  #pytorch backend
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:2000,500-reqs:3000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:8]
+
+  #llama_v4_scout_17b_16e_instruct_fp8
+  #pytorch backend
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:2000,500-reqs:3000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-tp:8-gpus:8]
+
+  #deepseek_r1_fp8
+  #pytorch backend
+  - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8]


 - condition:
@@ -516,6 +543,7 @@ trt_llm_release_perf_test:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,200-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,200-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-gpus:8]
  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:8]


@@ -529,18 +557,27 @@ trt_llm_release_perf_test:
       - '*6000*'
       linux_distribution_name: '*'
   tests:
+  #llama_v3.1_8b
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:nvfp4]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8]
+  #llama_v3.1_70b
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:500,2000-tp:2-gpus:2]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:1000,1000-tp:2-gpus:2]
+  #llama_v3.3_nemotron_super_49b
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2]
+  #deepseek_v3_lite
   - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-float4-input_output_len:128,128]
   - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-streaming-float4-input_output_len:128,128]
   - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128]
+  #mixtral_8x7b_v0.1
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-tp:2-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:2-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:2-gpus:2]
