@@ -486,10 +486,10 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_mpi_world_size(4)
     @parametrize_with_ids("cuda_graph", [True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
-                             ids=["tp8ep8", "tp4"])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4), (4, 1, 1)],
+                             ids=["tp4ep4", "tp4"])
     def test_fp8_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"
         with LLM(
@@ -499,8 +499,7 @@ def test_fp8_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
-                cuda_graph_config=CudaGraphConfig()
-                if cuda_graph else None) as llm:
+                use_cuda_graph=cuda_graph) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
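
For context on the hunk above: the conditional `cuda_graph_config=CudaGraphConfig() if cuda_graph else None` is collapsed into the boolean `use_cuda_graph` flag that the chunked-prefill tests later in this diff already pass. A minimal sketch of the resulting call shape, assuming the top-level `tensorrt_llm.LLM` class this test module uses; the model path is illustrative, not from the diff:

    from tensorrt_llm import LLM  # assumption: same LLM class the test module imports

    # Sketch only: mirrors the kwargs from the hunk above for the tp4ep4 case.
    with LLM("/models/Llama-4-Scout-17B-16E-Instruct-FP8",  # illustrative local path
             tensor_parallel_size=4,
             pipeline_parallel_size=1,
             moe_expert_parallel_size=4,
             max_seq_len=8192,
             use_cuda_graph=True) as llm:  # boolean flag replaces the CudaGraphConfig() branch
        ...
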
@@ -509,14 +508,15 @@ def test_fp8_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_mpi_world_size(4)
     @parametrize_with_ids("cuda_graph", [True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8)],
-                             ids=["tp8ep8"])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
+                             ids=["tp4ep4"])
     def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
         with LLM(
                 f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8",
                 tensor_parallel_size=tp_size,
+                max_seq_len=22000,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
                 enable_chunked_prefill=True,
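
For context: the new `max_seq_len=22000` bounds the total sequence length, while the pre-existing `max_num_tokens=256` caps the tokens processed per forward pass. Assuming chunked prefill splits a context into chunks of at most `max_num_tokens` tokens, a maximal 22000-token sequence is prefilled in ceil(22000 / 256) = 86 chunks.
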
@@ -530,10 +530,10 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_mpi_world_size(4)
     @parametrize_with_ids("cuda_graph", [True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
-                             ids=["tp8ep8", "tp4"])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4), (4, 1, 1)],
+                             ids=["tp4ep4", "tp4"])
     def test_fp4_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4"
         with LLM(
@@ -554,14 +554,15 @@ def test_fp4_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
     @skip_pre_blackwell
     @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8)],
-                             ids=["tp8ep8"])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
+                             ids=["tp4ep4"])
     def test_fp4_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
         with LLM(
                 f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4",
                 tensor_parallel_size=tp_size,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                max_seq_len=22000,
                 enable_chunked_prefill=True,
                 max_num_tokens=256,
                 use_cuda_graph=cuda_graph) as llm:
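
With these changes the tp4ep4 variants need only four MPI ranks instead of eight. A hypothetical invocation, assuming one rank per GPU and that the suite runs under MPI as `skip_less_mpi_world_size` implies (the test-file path is illustrative):

    # Assumption: 4 GPUs, one MPI rank each; select only the tp4ep4 FP8 case.
    mpirun -n 4 python -m pytest \
        tests/integration/defs/accuracy/test_llm_api_pytorch.py \
        -k "test_fp8_prequantized and tp4ep4"
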