
Commit d843a67

Add triton attn test to attn+quant fusion

Signed-off-by: Luka Govedič <[email protected]>

1 parent 1277999

File tree: 1 file changed (+12, −9)


tests/compile/test_fusion_attn.py

Lines changed: 12 additions & 9 deletions
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
+import itertools

 import pytest
 import torch._dynamo
@@ -99,6 +100,7 @@ def build_attn_metadata(self, batch_size: int) -> AttentionMetadata:
         num_blocks = batch_size * max_blocks
         backend = self.attn.backend

+        # TODO use get_kv_cache_stride_order
         # Create dummy KV cache for the selected backend
         if backend == _Backend.ROCM_ATTN:
             # k/v as 1st dimention
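The added TODO points at get_kv_cache_stride_order as a way to drop the per-backend branching when allocating the dummy KV cache. As a rough, hedged sketch of why that branching exists today (the shapes below are illustrative assumptions, not vLLM's actual layouts, and make_dummy_kv_cache is a hypothetical helper, not an API in the repo):

import torch

def make_dummy_kv_cache(
    backend: str,
    num_blocks: int,
    block_size: int = 16,
    num_kv_heads: int = 8,
    head_size: int = 128,
    dtype: torch.dtype = torch.bfloat16,
) -> torch.Tensor:
    # Hedged sketch only: each backend expects the K/V dimension in a
    # different position, so the test has to allocate the cache per backend.
    if backend == "ROCM_ATTN":
        # k/v as the 1st dimension (assumed layout for illustration)
        shape = (2, num_blocks, block_size, num_kv_heads, head_size)
    else:
        # k/v after the block dimension (assumed layout for illustration)
        shape = (num_blocks, 2, block_size, num_kv_heads, head_size)
    return torch.zeros(shape, dtype=dtype)

A stride-order helper would let the test build one shape generically instead of enumerating backends, which is what the TODO suggests.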
@@ -240,7 +242,8 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
 MODELS_FP4 = []
 HEADS = []
 SPLIT_ATTENTION = []
-BACKENDS: list[_Backend] = []
+BACKENDS_FP8: list[_Backend] = []
+BACKENDS_FP4: list[_Backend] = []

 if current_platform.is_cuda():
     MODELS_FP8 = [
@@ -251,10 +254,11 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
     ]
     HEADS = [(64, 8), (40, 8)]
     SPLIT_ATTENTION = [False]
-    BACKENDS = []  # TODO [_Backend.TRITON_ATTN]
+    BACKENDS_FP8 = [_Backend.TRITON_ATTN]

     if current_platform.is_device_capability((10, 0)):
-        BACKENDS += [_Backend.FLASHINFER]
+        BACKENDS_FP8 += [_Backend.FLASHINFER]
+        BACKENDS_FP4 += [_Backend.FLASHINFER]
         MODELS_FP4 += [
             (
                 "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
@@ -288,13 +292,12 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
 )
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
 @pytest.mark.parametrize(
-    "model_name, model_class, custom_ops",
+    "backend, model, custom_ops",
     # Test attention+quant_fp8 fusion with custom and torch impls
-    [(*model, c) for model in MODELS_FP8 for c in ["+quant_fp8", "-quant_fp8"]]
+    list(itertools.product(BACKENDS_FP8, MODELS_FP8, ["+quant_fp8", "-quant_fp8"]))
     # quant_fp4 only has the custom impl
-    + [(*model, c) for model in MODELS_FP4 for c in [""]],
+    + list(itertools.product(BACKENDS_FP4, MODELS_FP4, [""])),
 )
-@pytest.mark.parametrize("backend", BACKENDS)
 @pytest.mark.parametrize("use_inductor_graph_partition", USE_INDUCTOR_GRAPH_PARTITION)
 @pytest.mark.skipif(
     not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA"
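Because the backend now rides along in the same parametrize call, FP8 and FP4 models can be paired with different backend sets, which the removed standalone @pytest.mark.parametrize("backend", BACKENDS) could not express. A small sketch of how the itertools.product call expands, with toy stand-ins for the backend members and the (model_name, model_class) tuples:

import itertools

BACKENDS_FP8 = ["TRITON_ATTN", "FLASHINFER"]
MODELS_FP8 = [("model-a-FP8", "ModelA"), ("model-b-FP8", "ModelB")]

cases = list(itertools.product(BACKENDS_FP8, MODELS_FP8, ["+quant_fp8", "-quant_fp8"]))
# Each case is a (backend, model, custom_ops) triple, e.g.:
#   ("TRITON_ATTN", ("model-a-FP8", "ModelA"), "+quant_fp8")
# Unlike the old [(*model, c) ...] comprehension, the model tuple stays intact,
# which is why the test signature below switches to a single `model` parameter.
assert len(cases) == len(BACKENDS_FP8) * len(MODELS_FP8) * 2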
@@ -307,8 +310,7 @@ def test_attention_quant_pattern(
     batch_size: int,
     dtype: torch.dtype,
     custom_ops: str,
-    model_name: str,
-    model_class: type[AttentionQuantPatternModel],
+    model: tuple[str, type[AttentionQuantPatternModel]],
     backend: _Backend,
     use_inductor_graph_partition: bool,
     dist_init,
@@ -317,6 +319,7 @@ def test_attention_quant_pattern(
     """Test AttentionStaticQuantPattern fusion pass"""

     custom_ops_list = custom_ops.split(",") if custom_ops else []
+    model_name, model_class = model

     device = torch.device("cuda:0")
     torch.manual_seed(42)
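The (model_name, model_class) pair now arrives as a single `model` parameter and is unpacked at the top of the test. A minimal, hypothetical sketch of that calling convention (toy names only, not the repo's test):

def run_case(model: tuple[str, type]) -> None:
    # Unpack the parametrized (name, class) pair, mirroring the diff above.
    model_name, model_class = model
    print(f"running {model_name} with {model_class.__name__}")

run_case(("toy-model-FP8", object))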

0 commit comments