
Commit f771e6e

ruodil authored and lancelly committed
test:[nvbug 5415268] add kv_cache_free_gpu_mem_fraction param and llama4 rcca cases (NVIDIA#6430)
Signed-off-by: ruodil <[email protected]>
Signed-off-by: Lanyu Liao <[email protected]>
1 parent af892b8 commit f771e6e

2 files changed: +26, -11 lines changed


tests/integration/defs/perf/test_perf.py

Lines changed: 14 additions & 1 deletion
@@ -375,6 +375,7 @@ def __init__(
         tp_size: int = 1,
         pp_size: int = 1,
         num_gpus: int = 1,
+        kv_cache_free_gpu_mem_fraction: float = 0.9,
     ):
         # The model name.
         self.model_name = model_name
@@ -428,6 +429,8 @@ def __init__(
         self.num_gpus = num_gpus
         # Just build engines
         self.build_only = False
+        # kv cache free gpu mem fraction
+        self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction
 
     def to_string(self,
                   custom_bs: int = None,
@@ -541,6 +544,10 @@ def to_string(self,
         if self.num_gpus > 1:
             entries.append(f"gpus:{self.num_gpus}")
 
+        # Add kv cache free gpu mem fraction.
+        if self.kv_cache_free_gpu_mem_fraction != 0.9:
+            entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}")
+
         # Concatenate labels with "-".
         return "-".join(entries)
 
@@ -648,6 +655,11 @@ def load_from_str(self, test_param_labels) -> None:
         self.num_gpus = 1 if not labels[0].startswith("gpus:") else int(
             labels.pop(0).replace("gpus:", ""))
 
+        if len(labels) > 0:
+            self.kv_cache_free_gpu_mem_fraction = 0.9 if not labels[
+                0].startswith("kv_frac:") else float(
+                    labels.pop(0).replace("kv_frac:", ""))
+
         assert len(
             labels
         ) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}"
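Taken together, the to_string and load_from_str hunks form a round trip: to_string emits the kv_frac: label only when the value differs from the 0.9 default, and load_from_str pops it from the same fixed position, right after the optional gpus: label. A minimal, self-contained sketch of that encode/parse pair (reduced to two labels; the real config class handles many more, and the helper names here are illustrative, not from this diff):

DEFAULT_KV_FRAC = 0.9

def encode_labels(num_gpus: int, kv_frac: float) -> str:
    # Mirrors to_string: labels are appended in a fixed order and
    # joined with "-"; a default kv fraction is omitted entirely.
    entries = []
    if num_gpus > 1:
        entries.append(f"gpus:{num_gpus}")
    if kv_frac != DEFAULT_KV_FRAC:
        entries.append(f"kv_frac:{kv_frac}")
    return "-".join(entries)

def decode_labels(name: str) -> tuple[int, float]:
    # Mirrors load_from_str: optional labels are popped in the same
    # order they were emitted; any leftover label means a malformed name.
    labels = name.split("-") if name else []
    num_gpus = 1
    if labels and labels[0].startswith("gpus:"):
        num_gpus = int(labels.pop(0).replace("gpus:", ""))
    kv_frac = DEFAULT_KV_FRAC
    if labels and labels[0].startswith("kv_frac:"):
        kv_frac = float(labels.pop(0).replace("kv_frac:", ""))
    assert not labels, f"Invalid test name! Some labels cannot be parsed: {labels}"
    return num_gpus, kv_frac

assert encode_labels(8, 0.6) == "gpus:8-kv_frac:0.6"
assert decode_labels("gpus:8-kv_frac:0.6") == (8, 0.6)
assert encode_labels(8, 0.9) == "gpus:8"    # default fraction is omitted...
assert decode_labels("gpus:8") == (8, 0.9)  # ...and restored on parse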
@@ -998,7 +1010,8 @@ def get_trtllm_bench_build_command(self, engine_dir) -> list:
             f"--workspace={engine_dir}", f"--model={hf_model_name}",
             f"--model_path={model_dir}", "build", f"--dataset={dataset_path}",
             f"--tp_size={self._config.tp_size}",
-            f"--pp_size={self._config.pp_size}"
+            f"--pp_size={self._config.pp_size}",
+            f"--kv_cache_free_gpu_mem_fraction={self._config.kv_cache_free_gpu_mem_fraction}"
         ]
         max_seq_len = max(self._config.input_lens) + max(
             self._config.output_lens)
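With the new element appended, the argument list assembled by get_trtllm_bench_build_command for one of the kv_frac:0.6 cases below would contain, roughly, the following (a sketch: the leading executable name, workspace, dataset, and model paths are assumptions for illustration, not values from this diff):

# Hypothetical result of the build-command assembly above for a
# tp:8 / kv_frac:0.6 case; all concrete paths are placeholders.
build_cmd = [
    "trtllm-bench",                       # assumed executable name
    "--workspace=/tmp/workspace",
    "--model=meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    "--model_path=/models/llama4-maverick-fp8",
    "build",
    "--dataset=/tmp/synthetic_dataset.json",
    "--tp_size=8",
    "--pp_size=1",
    "--kv_cache_free_gpu_mem_fraction=0.6",
]

Note the trailing comma added to the --pp_size line in the hunk above: without it, Python's implicit string literal concatenation would silently fuse the old last argument and the new flag into one malformed string.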

tests/integration/test_lists/qa/trt_llm_release_perf_test.yml

Lines changed: 12 additions & 10 deletions
@@ -473,19 +473,21 @@ trt_llm_release_perf_test:
 
   #llama_v4_maverick_17b_128e_instruct_fp8
   #pytorch backend
-  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:2000,500-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  #rcca case
+  - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:20000,2000-reqs:1000-ep:8-tp:8-gpus:8-kv_frac:0.6]
 
   #llama_v4_scout_17b_16e_instruct_fp8
   #pytorch backend
-  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:2000,500-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:2000,500-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:500,2000-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8-kv_frac:0.6]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-ep:8-tp:8-gpus:8-kv_frac:0.6]
 
   #deepseek_r1_fp8
   #pytorch backend
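The new rcca entry stresses a long-context shape (20000 input, 2000 output tokens), and every llama4 case now pins the KV-cache fraction to 0.6 instead of the 0.9 default, presumably to leave memory headroom at these sequence lengths. For reference, the same knob set through TensorRT-LLM's Python LLM API would look roughly like this (KvCacheConfig and its field name are taken from the public API, not from this diff, so treat them as an assumption):

from tensorrt_llm.llmapi import KvCacheConfig

# Mirror kv_frac:0.6 from the test names above: hand 60% of the GPU
# memory still free after weight loading to the KV-cache pool.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)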
