
Commit 095277c

Simplify matcher utils by using RMSNorm.forward_static
Signed-off-by: Luka Govedič <[email protected]>
1 parent c3264d8

2 files changed: 8 additions, 33 deletions


vllm/compilation/matcher_utils.py

Lines changed: 6 additions & 32 deletions
@@ -65,8 +65,6 @@ def inputs(self) -> list[torch.Tensor]:
 class MatcherRMSNorm(MatcherCustomOp):
     def __init__(self, epsilon: float, enabled: Optional[bool] = None):
         if enabled is None:
-            # TODO either pass config to enabled or set it globally
-            # (global during pass init seems reasonable)
             enabled = RMSNorm.enabled()
 
         super().__init__(enabled)
@@ -83,7 +81,6 @@ def forward_custom(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         result = torch.empty_like(input)
         _, result = auto_functionalized(
@@ -100,28 +97,15 @@ def forward_native(
         self,
         input: torch.Tensor,
         weight: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        x = input.to(torch.float32)
-        if residual is not None:
-            x = x + residual
-            residual = x.to(self.model_dtype)
-
-        variance = x.pow(2).mean(dim=-1, keepdim=True)
-
-        x = x * torch.rsqrt(variance + self.epsilon)
-        x = x.to(self.model_dtype)
-        if weight is not None:
-            x = x * weight
-
-        return x if residual is None else (x, residual)
+        return RMSNorm.forward_static(
+            input, self.epsilon, input.size(-1), self.model_dtype, weight
+        )
 
 
 class MatcherFusedAddRMSNorm(MatcherCustomOp):
     def __init__(self, epsilon: float, enabled: Optional[bool] = None):
         if enabled is None:
-            # TODO either pass config to enabled or set it globally
-            # (global during pass init seems reasonable)
             enabled = RMSNorm.enabled()
 
         super().__init__(enabled)
@@ -157,19 +141,9 @@ def forward_native(
         weight: torch.Tensor,
         residual: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        x = input.to(torch.float32)
-        if residual is not None:
-            x = x + residual
-            residual = x.to(self.model_dtype)
-
-        variance = x.pow(2).mean(dim=-1, keepdim=True)
-
-        x = x * torch.rsqrt(variance + self.epsilon)
-        x = x.to(self.model_dtype)
-        if weight is not None:
-            x = x * weight
-
-        return x if residual is None else (x, residual)
+        return RMSNorm.forward_static(
+            input, self.epsilon, input.size(-1), self.model_dtype, weight, residual
+        )
 
 
 class MatcherQuant:
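
Note (not part of the commit): the inline math deleted from both matchers above is the same reference RMSNorm computation that RMSNorm.forward_static now provides in one place. A minimal sketch of that shared computation follows, for reference only; the standalone helper name is illustrative.

import torch

def rms_norm_reference(x, epsilon, model_dtype, weight=None, residual=None):
    # Mirrors the deleted forward_native bodies: optional fused residual add,
    # RMS normalization in float32, then cast back to the model dtype.
    x = x.to(torch.float32)
    if residual is not None:
        x = x + residual
        residual = x.to(model_dtype)
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    x = x * torch.rsqrt(variance + epsilon)
    x = x.to(model_dtype)
    if weight is not None:
        x = x * weight
    return x if residual is None else (x, residual)

With the commit applied, MatcherRMSNorm delegates via RMSNorm.forward_static(input, self.epsilon, input.size(-1), self.model_dtype, weight), and MatcherFusedAddRMSNorm additionally passes residual, so neither matcher duplicates this logic anymore.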

vllm/model_executor/layers/layernorm.py

Lines changed: 2 additions & 1 deletion
@@ -187,12 +187,12 @@ def forward_static(
         x: torch.Tensor,
         variance_epsilon: float,
         hidden_size: int,
+        orig_dtype: torch.dtype,
         weight: Optional[torch.Tensor] = None,
         residual: Optional[torch.Tensor] = None,
         variance_size_override: Optional[int] = None,
     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
         """PyTorch-native implementation equivalent to forward()."""
-        orig_dtype = x.dtype
         x = x.to(torch.float32)
         if residual is not None:
             # residual promoted f16->f32 automatically,
@@ -239,6 +239,7 @@ def forward_native(
             x,
             self.variance_epsilon,
             self.hidden_size,
+            x.dtype,
             self.weight.data if self.has_weight else None,
             residual,
             self.variance_size_override,
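
The layernorm.py change makes the output dtype an explicit argument: forward_static previously recovered orig_dtype from x.dtype, but the matchers need results in self.model_dtype even when the traced input is in a different dtype, so the caller now supplies it; RMSNorm.forward_native keeps the old behaviour by passing x.dtype. A hedged usage sketch under that reading of the diff, with made-up example tensors and epsilon:

import torch
from vllm.model_executor.layers.layernorm import RMSNorm

hidden = torch.randn(2, 4096, dtype=torch.bfloat16)
weight = torch.ones(4096, dtype=torch.bfloat16)

# orig_dtype is the fourth positional argument after this commit; it sets the
# dtype of the returned tensor(s) instead of being inferred from the input.
out = RMSNorm.forward_static(
    hidden,            # x
    1e-6,              # variance_epsilon (example value)
    hidden.size(-1),   # hidden_size
    torch.bfloat16,    # orig_dtype: dtype to cast the normalized output back to
    weight,            # optional scale; residual and variance_size_override remain optional
)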
