@@ -416,15 +416,16 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
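
For reference, a sketch of the full "PyTorch Fullgraph Test" step as it reads once this hunk is applied (assembled purely from the diff lines above; anything outside the shown context is assumed unchanged):

- label: PyTorch Fullgraph Test # 22min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
  - pytest -v -s compile/test_fusions_e2e.py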
@@ -807,8 +808,8 @@ steps:
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
-- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   gpu: b200
   # optional: true
@@ -821,8 +822,6 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
   commands:
   - nvidia-smi
   - python3 examples/offline_inference/basic/chat.py
@@ -839,15 +838,32 @@ steps:
   - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
   - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
   - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
-  # Fusion
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
+
+- label: Blackwell Fusion Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+  - nvidia-smi
+  - pytest -v -s tests/compile/test_fusion_attn.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-  - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
-  - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+  # this runner has 2 GPUs available even though num_gpus=2 is not set
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py
 
 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
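
The inline comment above notes that the all-reduce fusion test relies on an implicit property of the b200 runner (two visible GPUs without `num_gpus` being set). A minimal sketch of how that requirement could be stated explicitly, assuming the same `num_gpus` key the H200 step below already uses (hypothetical; not part of this change):

- label: Blackwell Fusion Tests # 30 min
  timeout_in_minutes: 40
  gpu: b200
  num_gpus: 2  # hypothetical: makes the 2-GPU assumption explicit instead of implicit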
@@ -1100,14 +1116,16 @@ steps:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 
 ##### H200 test #####
-- label: Distrubted Tests (H200) # optional
+- label: Distributed Tests (H200) # optional
   gpu: h200
   optional: true
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
   - pytest -v -s tests/compile/test_async_tp.py
   - pytest -v -s tests/compile/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
 
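
Applied, the updated H200 step reads as follows (a sketch reconstructed from the hunk above, including its rename fix; nothing beyond the shown context is assumed):

- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - pytest -v -s tests/compile/test_async_tp.py
  - pytest -v -s tests/compile/test_sequence_parallelism.py
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  - pytest -v -s tests/distributed/test_context_parallel.py
  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048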