Skip to content

Commit 816a120

Browse files
authored
[TRTLLM-6991][chore] add DeepSeek-R1 FP8 accuracy tests on Blackwell (#6710)
Signed-off-by: Fanrong Li <[email protected]>
1 parent 2bb90ba commit 816a120

File tree

6 files changed: 23 additions and 7 deletions.

jenkins/L0_Test.groovy

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1894,12 +1894,13 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
18941894

18951895
multiNodesSBSAConfigs = [
18961896
// Each stage test 1 testcase with 8 GPUs and 2 nodes.
1897-
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 6, 8, 2],
1898-
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 6, 8, 2],
1899-
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 6, 8, 2],
1900-
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 6, 8, 2],
1901-
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 6, 8, 2],
1902-
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-6": ["gb200-multi-node", "l0_gb200_multi_nodes", 6, 6, 8, 2],
1897+
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 7, 8, 2],
1898+
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 7, 8, 2],
1899+
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 7, 8, 2],
1900+
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 7, 8, 2],
1901+
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 7, 8, 2],
1902+
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-6": ["gb200-multi-node", "l0_gb200_multi_nodes", 6, 7, 8, 2],
1903+
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-7": ["gb200-multi-node", "l0_gb200_multi_nodes", 7, 7, 8, 2],
19031904
]
19041905
fullSet += multiNodesSBSAConfigs.keySet()
19051906

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,9 @@ deepseek-ai/DeepSeek-R1:
7474
- quant_algo: FP8_BLOCK_SCALES
7575
spec_dec_algo: MTP
7676
accuracy: 95.413
77+
- quant_algo: FP8_BLOCK_SCALES
78+
kv_cache_quant_algo: FP8
79+
accuracy: 95.413
7780
Qwen3/Qwen3-8B:
7881
- accuracy: 87.1114
7982
- quant_algo: FP8

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -163,6 +163,9 @@ deepseek-ai/DeepSeek-R1:
163163
- quant_algo: FP8_BLOCK_SCALES
164164
spec_dec_algo: MTP
165165
accuracy: 87.573
166+
- quant_algo: FP8_BLOCK_SCALES
167+
kv_cache_quant_algo: FP8
168+
accuracy: 87.573
166169
Qwen3/Qwen3-8B:
167170
- quant_algo: W4A8_MXFP4_FP8
168171
accuracy: 72.70

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1663,10 +1663,17 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
16631663
def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
16641664
attention_dp, cuda_graph, overlap_scheduler,
16651665
max_batch_size):
1666-
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
1666+
if get_sm_version() == 100:
1667+
moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384)
1668+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
1669+
else:
1670+
moe_config = MoeConfig()
1671+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
1672+
16671673
pytorch_config = dict(
16681674
disable_overlap_scheduler=not overlap_scheduler,
16691675
cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
1676+
moe_config=moe_config,
16701677
)
16711678

16721679
if fp8kv:

tests/integration/test_lists/qa/llm_function_full.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -498,6 +498,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_
498498
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput]
499499
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8]
500500
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]
501+
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput]
501502
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
502503
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
503504
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True]

tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ l0_gb200_multi_nodes:
1616
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
1717
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
1818
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
19+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
1920
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
2021
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)
2122
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (180)

0 commit comments

Comments
 (0)