
Commit f3b4cf1

TEMP Mostly working
Signed-off-by: Luka Govedič <[email protected]>
1 parent 21d7d67 commit f3b4cf1

File tree

5 files changed: +204 -104 lines changed


tests/compile/test_fusion.py

Lines changed: 33 additions & 4 deletions
@@ -8,6 +8,7 @@
 from vllm.compilation.fusion import (
     FUSED_OPS,
     QUANT_OPS,
+    RMS_OP,
     FusedRMSQuantKey,
     RMSNormQuantFusionPass,
 )
@@ -65,6 +66,9 @@ def __init__(
             act_quant_group_shape=group_shape,
         )

+        self.enable_rms_norm = self.norm[0].enabled()
+        self.enable_quant_fp8 = self.fp8_linear.quant_fp8.enabled()
+
     def forward(self, x):
         resid = torch.sqrt(x)
         y = self.norm[0](x)
@@ -82,7 +86,18 @@ def forward(self, x):
         return y3

     def ops_in_model_before(self):
-        return [QUANT_OPS[self.key]]
+        ops = []
+        if self.enable_rms_norm:
+            ops += [RMS_OP]
+        else:
+            ops += [torch.ops.aten.rsqrt.default]
+
+        if self.enable_quant_fp8:
+            ops += [QUANT_OPS[self.key]]
+        else:
+            ops += [torch.ops.aten.reciprocal.default]
+
+        return ops

     def ops_in_model_after(self):
         return [
@@ -91,11 +106,13 @@ def ops_in_model_after(self):
         ]


-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.float16])  # , torch.bfloat16])
 @pytest.mark.parametrize("hidden_size", [64])
 @pytest.mark.parametrize("num_tokens", [257])
 @pytest.mark.parametrize("eps", [1e-5, 1e-6])
 @pytest.mark.parametrize("static", [True, False])
+@pytest.mark.parametrize("enable_rms_norm", [True])  # , False])
+@pytest.mark.parametrize("enable_quant_fp8", [True])  # , False])
 # cuda_force_torch used to test torch code path on platforms that
 # cutlass_fp8_supported() == True.
 @pytest.mark.parametrize(
@@ -105,17 +122,29 @@ def ops_in_model_after(self):
     not current_platform.is_cuda_alike(), reason="Only test on CUDA and ROCm"
 )
 def test_fusion_rmsnorm_quant(
-    dtype, hidden_size, num_tokens, eps, static, cuda_force_torch
+    dtype,
+    hidden_size,
+    num_tokens,
+    eps,
+    static,
+    enable_rms_norm,
+    enable_quant_fp8,
+    cuda_force_torch,
 ):
     torch.set_default_device("cuda")
     torch.set_default_dtype(dtype)
     torch.manual_seed(1)
     maybe_create_device_identity()  # needed for certain non-cutlass fp8 paths

+    custom_ops = []
+    if enable_rms_norm:
+        custom_ops.append("+rms_norm")
+    if enable_quant_fp8:
+        custom_ops.append("+quant_fp8")
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
             level=CompilationLevel.PIECEWISE,
-            custom_ops=["+rms_norm", "+quant_fp8"],
+            custom_ops=custom_ops,
             pass_config=PassConfig(enable_fusion=True, enable_noop=True),
         )
     )
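The test changes above can be read as a single rule: when a custom op is enabled via "+rms_norm" or "+quant_fp8", the pre-fusion graph must contain the unfused custom kernel; when it is disabled, the native-PyTorch decomposition shows up instead. A minimal summary sketch (not part of the diff) that restates this mapping, reusing the same RMS_OP/QUANT_OPS symbols the test imports; "key" stands in for the test's quant key:

import torch

from vllm.compilation.fusion import QUANT_OPS, RMS_OP

def expected_op_before_fusion(flag: str, enabled: bool, key=None):
    # Enabled custom op -> the unfused custom kernel must appear pre-fusion.
    # Disabled custom op -> its aten decomposition (rsqrt / reciprocal) appears.
    if flag == "rms_norm":
        return RMS_OP if enabled else torch.ops.aten.rsqrt.default
    if flag == "quant_fp8":
        return QUANT_OPS[key] if enabled else torch.ops.aten.reciprocal.default
    raise ValueError(f"unknown flag {flag!r}")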

vllm/_custom_ops.py

Lines changed: 1 addition & 1 deletion
@@ -1507,7 +1507,7 @@ def scaled_fp8_quant(
                 output, input, scale, scale_ub
             )
         else:
-            scale = torch.empty(1, device=input.device, dtype=torch.float32)
+            scale = torch.empty((1, 1), device=input.device, dtype=torch.float32)
             torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
     else:
         assert scale.numel() == 1, f"{scale.shape}"
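The (1, 1) shape keeps the eagerly allocated per-tensor scale consistent with what the pattern matcher's make_scale helper produces, where the per-tensor "group" spans the whole 2-D input so each dimension collapses to one entry. A small illustrative check (not part of the commit, shapes made up):

import torch

def per_tensor_scale_shape(x: torch.Tensor) -> tuple[int, int]:
    # Per-tensor quantization: the scale group covers the entire tensor,
    # so the scale shape is (rows // rows, cols // cols) == (1, 1).
    rows, cols = x.shape
    return (rows // rows, cols // cols)

x = torch.randn(257, 64)
scale = torch.empty(per_tensor_scale_shape(x), dtype=torch.float32)
assert scale.shape == (1, 1)  # matches the new torch.empty((1, 1), ...) allocation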

vllm/compilation/fusion.py

Lines changed: 20 additions & 79 deletions
@@ -24,6 +24,7 @@
 from vllm.platforms import current_platform

 from .inductor_pass import enable_fake_mode
+from .matcher_utils import MatcherQuant, MatcherRMSNorm
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass

 logger = init_logger(__name__)
@@ -99,6 +100,9 @@ def __init__(self, epsilon: float, key: FusedRMSQuantKey):
         assert key in FUSED_OPS, f"unsupported fused rmsnorm+quant op for {key}"
         self.FUSED_OP = FUSED_OPS[key]

+        self.rmsnorm_matcher = MatcherRMSNorm(epsilon)
+        self.quant_matcher = MatcherQuant(key.quant)
+

 class RMSNormStaticQuantPattern(RMSNormQuantPattern):
     def __init__(self, epsilon: float, quant_dtype: torch.dtype, symmetric=True):
@@ -113,25 +117,8 @@ def __init__(self, epsilon: float, quant_dtype: torch.dtype, symmetric=True):
     def register(self, pm_pass: PatternMatcherPass):
         # Cannot use methods, as the self argument affects tracing
         def pattern(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
-            result_rms = torch.empty_like(input)
-            # TODO: why does empty_like produce a permute but
-            # empty via shape doesn't?
-            result = torch.empty(
-                input.shape, device=input.device, dtype=self.quant_dtype
-            )
-            at1 = auto_functionalized(
-                RMS_OP,
-                result=result_rms,
-                input=input,
-                weight=weight,
-                epsilon=self.epsilon,
-            )
-            at2 = auto_functionalized(
-                self.QUANT_OP, result=result, input=at1[1], scale=scale
-            )
-
-            # result
-            return at2[1]
+            result_rms = self.rmsnorm_matcher(input, weight)
+            return self.quant_matcher(result_rms, scale)

         def replacement(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
             result = torch.empty_like(input, dtype=self.quant_dtype)
@@ -173,22 +160,10 @@ def pattern(
             weight: torch.Tensor,
             scale: torch.Tensor,
         ):
-            result = torch.empty(
-                input.shape, device=input.device, dtype=self.quant_dtype
-            )
-            at = auto_functionalized(
-                RMS_ADD_OP,
-                input=input,
-                residual=residual,
-                weight=weight,
-                epsilon=self.epsilon,
-            )
-            at1 = auto_functionalized(
-                self.QUANT_OP, result=result, input=at[1], scale=scale
-            )
+            result_rms, residual = self.rmsnorm_matcher(input, weight, residual)
+            result = self.quant_matcher(result_rms, scale)

-            # result, residual
-            return at1[1], at[2]
+            return result, residual

         def replacement(
             input: torch.Tensor,
@@ -242,27 +217,14 @@ def __init__(
         super().__init__(epsilon, key)

     def register(self, pm_pass: PatternMatcherPass):
-        def pattern(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
-            result_rms = torch.empty_like(input)
-            result = torch.empty(
-                input.shape, device=input.device, dtype=self.quant_dtype
-            )
-            at1 = auto_functionalized(
-                RMS_OP,
-                result=result_rms,
-                input=input,
-                weight=weight,
-                epsilon=self.epsilon,
-            )
-            at2 = auto_functionalized(
-                self.QUANT_OP, result=result, input=at1[1], scale=scale, scale_ub=None
-            )
-
+        def pattern(input: torch.Tensor, weight: torch.Tensor):
+            result_rms = self.rmsnorm_matcher(input, weight)
             # result, scale
-            return at2[1], at2[2]
+            return self.quant_matcher(result_rms)

-        def replacement(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
+        def replacement(input: torch.Tensor, weight: torch.Tensor):
             result = torch.empty_like(input, dtype=self.quant_dtype)
+            scale = self.quant_matcher.make_scale(input)
             at = auto_functionalized(
                 self.FUSED_OP,
                 result=result,
@@ -280,7 +242,6 @@ def replacement(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
         inputs = [
             empty_bf16(5, 4),  # input
             empty_bf16(1, 5),  # weight
-            empty_fp32(1, 1),  # scale
         ]

         pm.register_replacement(
@@ -308,36 +269,17 @@ def __init__(
         super().__init__(epsilon, key)

     def register(self, pm_pass: PatternMatcherPass):
-        def pattern(
-            input: torch.Tensor,
-            residual: torch.Tensor,
-            weight: torch.Tensor,
-            scale: torch.Tensor,
-        ):
-            result = torch.empty(
-                input.shape, device=input.device, dtype=self.quant_dtype
-            )
-            at = auto_functionalized(
-                RMS_ADD_OP,
-                input=input,
-                residual=residual,
-                weight=weight,
-                epsilon=self.epsilon,
-            )
-            at1 = auto_functionalized(
-                self.QUANT_OP, result=result, input=at[1], scale=scale, scale_ub=None
-            )
+        def pattern(input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor):
+            result_rms, residual = self.rmsnorm_matcher(input, weight, residual)
+            result, scale = self.quant_matcher(result_rms)

-            # result, residual, scale
-            return at1[1], at[2], at1[2]
+            return result, residual, scale

         def replacement(
-            input: torch.Tensor,
-            residual: torch.Tensor,
-            weight: torch.Tensor,
-            scale: torch.Tensor,
+            input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor
         ):
             result = torch.empty_like(input, dtype=self.quant_dtype)
+            scale = self.quant_matcher.make_scale(input)
             at = auto_functionalized(
                 self.FUSED_OP,
                 result=result,
@@ -356,7 +298,6 @@ def replacement(
             empty_bf16(5, 4),  # input
             empty_bf16(5, 4),  # residual
             empty_bf16(1, 5),  # weight
-            empty_fp32(1, 1),  # scale
         ]

         pm.register_replacement(
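For context, each pattern/replacement pair above is handed to the inductor pattern matcher through the pm.register_replacement call that closes every register() method (only the opening line is visible in these hunks). A sketch of the call shape, assuming the pm alias for torch._inductor.pattern_matcher that fusion.py already uses; with the scale argument dropped from the dynamic-quant patterns, the example-input list shrinks to input/residual/weight placeholders only:

pm.register_replacement(
    pattern,      # traced search pattern, now built from the matcher helpers
    replacement,  # graph that calls the fused op and allocates scale via make_scale
    inputs,       # e.g. [empty_bf16(5, 4), empty_bf16(5, 4), empty_bf16(1, 5)]
    pm.fwd_only,  # trace the pattern in forward-only mode
    pm_pass,      # the PatternMatcherPass this pattern is registered into
)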

vllm/compilation/matcher_utils.py

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional, Union
+
+import torch
+from torch._higher_order_ops import auto_functionalized
+from torch._ops import OpOverload
+
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey,
+    _normalize_quant_group_shape,
+    kFp8DynamicTensorSym,
+    kFp8DynamicTokenSym,
+    kFp8StaticTensorSym,
+)
+
+RMS_OP = torch.ops._C.rms_norm.default
+RMS_ADD_OP = torch.ops._C.fused_add_rms_norm.default
+
+QUANT_OPS: dict[QuantKey, OpOverload] = {
+    kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default,  # noqa: E501
+    kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
+    kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
+}
+
+# TODO
+# if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
+#     QUANT_OPS[
+#         kNvfp4Quant] = torch.ops._C.scaled_fp4_quant.default  # noqa: E501
+
+
+class MatcherRMSNorm:
+    def __init__(self, epsilon: float):
+        self.epsilon = epsilon
+
+    def forward(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        if residual is None:
+            result = torch.empty_like(input)
+            _, result = auto_functionalized(
+                RMS_OP,
+                result=result,
+                input=input,
+                weight=weight,
+                epsilon=self.epsilon,
+            )
+
+            return result
+        else:
+            _, result, residual = auto_functionalized(
+                RMS_ADD_OP,
+                input=input,
+                residual=residual,
+                weight=weight,
+                epsilon=self.epsilon,
+            )
+
+            return result, residual
+
+    def __call__(
+        self,
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        return self.forward(input, weight, residual)
+
+
+class MatcherQuant:
+    def __init__(self, quant_key: QuantKey):
+        self.quant_key = quant_key
+        assert quant_key in QUANT_OPS, f"unsupported quantization scheme {quant_key}"
+        self.QUANT_OP = QUANT_OPS[quant_key]
+
+    def forward(
+        self, input: torch.Tensor, scale: Optional[torch.Tensor] = None
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        # TODO: why does empty_like produce a permute but
+        # empty via shape doesn't?
+        result = torch.empty(
+            input.shape, device=input.device, dtype=self.quant_key.dtype
+        )
+
+        if self.quant_key.scale.static:
+            assert scale is not None
+            _, result = auto_functionalized(
+                self.QUANT_OP, result=result, input=input, scale=scale
+            )
+            return result
+        else:
+            assert scale is None
+            scale = self.make_scale(input)
+            _, result, scale = auto_functionalized(
+                self.QUANT_OP, result=result, input=input, scale=scale, scale_ub=None
+            )
+            return result, scale
+
+    def make_scale(self, input: torch.Tensor):
+        normalized_group_shape = _normalize_quant_group_shape(
+            input, self.quant_key.scale.group_shape
+        )
+        scale_shape = (
+            input.shape[0] // normalized_group_shape[0],
+            input.shape[1] // normalized_group_shape[1],
+        )
+
+        return torch.empty(scale_shape, device=input.device, dtype=torch.float32)
+
+    def __call__(
+        self, input: torch.Tensor, scale: Optional[torch.Tensor] = None
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        return self.forward(input, scale)
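A minimal usage sketch of the two helpers, mirroring how the fusion patterns above now compose them; the epsilon value and the kFp8DynamicTokenSym key are example choices, and the pattern function is only meaningful when traced by the inductor pattern matcher:

from vllm.compilation.matcher_utils import MatcherQuant, MatcherRMSNorm
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    kFp8DynamicTokenSym,
)

rmsnorm_matcher = MatcherRMSNorm(epsilon=1e-6)
quant_matcher = MatcherQuant(kFp8DynamicTokenSym)

def pattern(input, residual, weight):
    # fused_add_rms_norm followed by dynamic per-token fp8 quant, expressed
    # through the shared helpers instead of hand-written auto_functionalized calls
    rms_out, residual = rmsnorm_matcher(input, weight, residual)
    quant_out, scale = quant_matcher(rms_out)
    return quant_out, residual, scale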
