 from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
 from vllm.utils import is_torch_equal_or_newer
+from vllm.utils.flashinfer import has_flashinfer
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -238,52 +239,41 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
         )
 
 
-MODELS_FP8 = []
-MODELS_FP4 = []
-HEADS = []
-SPLIT_ATTENTION = []
+MODELS_FP8: list[tuple[str, type]] = []
+MODELS_FP4: list[tuple[str, type]] = []
+HEADS: list[tuple[int, int]] = []
+SPLIT_ATTENTION: list[bool] = []
 BACKENDS_FP8: list[_Backend] = []
 BACKENDS_FP4: list[_Backend] = []
 
 if current_platform.is_cuda():
+    HEADS = [(64, 8), (40, 8)]
     MODELS_FP8 = [
         (
             "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
             TestAttentionFp8StaticQuantPatternModel,
         )
     ]
-    HEADS = [(64, 8), (40, 8)]
-    SPLIT_ATTENTION = [False]
-    BACKENDS_FP8 = [_Backend.TRITON_ATTN]
-
-    if current_platform.is_device_capability((10, 0)):
-        BACKENDS_FP8 += [_Backend.FLASHINFER]
-        BACKENDS_FP4 += [_Backend.FLASHINFER]
-        MODELS_FP4 += [
-            (
-                "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
-                TestAttentionNvfp4QuantPatternModel,
-            )
-        ]
+    MODELS_FP4 = [
+        (
+            "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+            TestAttentionNvfp4QuantPatternModel,
+        )
+    ]
+    BACKENDS_FP8 = [_Backend.TRITON_ATTN, _Backend.FLASHINFER]
+    BACKENDS_FP4 = [_Backend.FLASHINFER]
 
 elif current_platform.is_rocm():
+    HEADS = [(32, 8), (40, 8)]
     MODELS_FP8 = [
         ("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel)
     ]
-    HEADS = [(32, 8), (40, 8)]
-    SPLIT_ATTENTION = [False, True]
     BACKENDS = [
-        _Backend.TRITON_ATTN,
         _Backend.ROCM_AITER_UNIFIED_ATTN,
         _Backend.ROCM_ATTN,
+        _Backend.TRITON_ATTN,
     ]
 
-# TODO(boyuan/luka): test inductor graph partition on rocm
-if is_torch_equal_or_newer("2.9.0.dev") and current_platform.is_cuda():
-    USE_INDUCTOR_GRAPH_PARTITION = [False, True]
-else:
-    USE_INDUCTOR_GRAPH_PARTITION = [False]
-
 
 @pytest.mark.parametrize("num_qo_heads, num_kv_heads", HEADS)
 @pytest.mark.parametrize("head_size", [128])
@@ -298,7 +288,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
     # quant_fp4 only has the custom impl
     + list(flat_product(BACKENDS_FP4, MODELS_FP4, [""])),
 )
-@pytest.mark.parametrize("use_inductor_graph_partition", USE_INDUCTOR_GRAPH_PARTITION)
+@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
 @pytest.mark.skipif(
     not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA"
 )
@@ -318,6 +308,14 @@ def test_attention_quant_pattern(
     caplog_vllm,
 ):
     """Test AttentionStaticQuantPattern fusion pass"""
+    if backend == _Backend.FLASHINFER and (
+        not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
+    ):
+        pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
+
+    # TODO(boyuan/luka): test inductor graph partition on rocm
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("Inductor graph partition requires torch>=2.9")
 
     custom_ops_list = custom_ops.split(",") if custom_ops else []
 
@@ -435,7 +433,7 @@ def test_attention_quant_pattern(
     )
 
     # access the underlying `AttnFusionPass` on the `LazyInitPass`
-    assert attn_pass.pass_.matched_count == 1
+    assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)
 
     # Check attention ops in the graph before and after fusion
     attn_nodes_pre = list(find_op_nodes(ATTN_OP, test_backend.graph_pre_pass))