@@ -486,10 +486,10 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_mpi_world_size(4)
     @parametrize_with_ids("cuda_graph", [True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
-                             ids=["tp8ep8", "tp4"])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4), (4, 1, 1)],
+                             ids=["tp4ep4", "tp4"])
     def test_fp8_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"
         with LLM(
@@ -499,8 +499,7 @@ def test_fp8_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
                 max_seq_len=8192,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
-                cuda_graph_config=CudaGraphConfig()
-                if cuda_graph else None) as llm:
+                use_cuda_graph=cuda_graph) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
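
For context on the hunk above: the conditional `cuda_graph_config=CudaGraphConfig() if cuda_graph else None` is collapsed into the boolean `use_cuda_graph` flag that the chunked-prefill tests later in this diff already pass. A minimal sketch of the resulting call shape, assuming the top-level `tensorrt_llm.LLM` class this test module uses; the model path is illustrative, not from the diff:

    from tensorrt_llm import LLM  # assumption: same LLM class the test module imports

    # Sketch only: mirrors the kwargs from the hunk above for the tp4ep4 case.
    with LLM("/models/Llama-4-Scout-17B-16E-Instruct-FP8",  # illustrative local path
             tensor_parallel_size=4,
             pipeline_parallel_size=1,
             moe_expert_parallel_size=4,
             max_seq_len=8192,
             use_cuda_graph=True) as llm:  # boolean flag replaces the CudaGraphConfig() branch
        ...
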
@@ -509,14 +508,15 @@ def test_fp8_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_hopper
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_mpi_world_size(4)
     @parametrize_with_ids("cuda_graph", [True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8)],
-                             ids=["tp8ep8"])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
+                             ids=["tp4ep4"])
     def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
         with LLM(
                 f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8",
                 tensor_parallel_size=tp_size,
+                max_seq_len=22000,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
                 enable_chunked_prefill=True,
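
For context: the new `max_seq_len=22000` bounds the total sequence length, while the pre-existing `max_num_tokens=256` caps the tokens processed per forward pass. Assuming chunked prefill splits a context into chunks of at most `max_num_tokens` tokens, a maximal 22000-token sequence is prefilled in ceil(22000 / 256) = 86 chunks.
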
@@ -530,10 +530,10 @@ def test_fp8_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
             task.evaluate(llm)
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_mpi_world_size(4)
     @parametrize_with_ids("cuda_graph", [True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
-                             ids=["tp8ep8", "tp4"])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4), (4, 1, 1)],
+                             ids=["tp4ep4", "tp4"])
     def test_fp4_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
         model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4"
         with LLM(
@@ -554,14 +554,15 @@ def test_fp4_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
     @skip_pre_blackwell
     @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("cuda_graph", [True])
-    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8)],
-                             ids=["tp8ep8"])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 4)],
+                             ids=["tp4ep4"])
     def test_fp4_chunked_prefill(self, cuda_graph, tp_size, pp_size, ep_size):
         with LLM(
                 f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4",
                 tensor_parallel_size=tp_size,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
+                max_seq_len=22000,
                 enable_chunked_prefill=True,
                 max_num_tokens=256,
                 use_cuda_graph=cuda_graph) as llm:
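
With these changes the tp4ep4 variants need only four MPI ranks instead of eight. A hypothetical invocation, assuming one rank per GPU and that the suite runs under MPI as `skip_less_mpi_world_size` implies (the test-file path is illustrative):

    # Assumption: 4 GPUs, one MPI rank each; select only the tp4ep4 FP8 case.
    mpirun -n 4 python -m pytest \
        tests/integration/defs/accuracy/test_llm_api_pytorch.py \
        -k "test_fp8_prequantized and tp4ep4"
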