Commit 14fdc8b (parent: e151e6d)

quant with fix for pure torch, broke others

Signed-off-by: Luka Govedič <[email protected]>

3 files changed: +13 −11 lines


tests/compile/test_fusion.py (2 additions, 4 deletions)

@@ -147,10 +147,8 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
     model2 = torch.compile(model, backend=backend)
     result2 = model2(x)
 
-    # Higher tol for dynamic, even higher for bfloat16
-    if static:
-        ATOL, RTOL = (1e-3, 1e-3)
-    elif dtype == torch.float16:
+    # Higher tol for dynamic bfloat16
+    if dtype == torch.float16 or static:
         ATOL, RTOL = (2e-3, 2e-3)
     else:
         ATOL, RTOL = (1e-2, 1e-2)
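For context, these tolerances presumably feed a comparison between the eager and compiled outputs later in the test. A minimal sketch of such a check, assuming result holds the eager-mode output computed earlier in the test (torch.testing.assert_close is a standard PyTorch API):

    # Sketch only: `result` is assumed to be the eager-mode output computed
    # earlier in the test; ATOL/RTOL come from the branch above.
    torch.testing.assert_close(result2, result, atol=ATOL, rtol=RTOL)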

vllm/compilation/fusion.py (4 additions, 4 deletions)

@@ -26,7 +26,7 @@
 
 
 def empty_bf16(*args, **kwargs):
-    return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device="cuda")
+    return torch.empty(*args, **kwargs, dtype=torch.float16, device="cuda")
 
 
 def empty_fp32(*args, **kwargs):
@@ -133,7 +133,7 @@ def replacement(input: torch.Tensor, weight: torch.Tensor,
             return at[1]
 
         inputs = [
-            empty_bf16(5, 4),  # input
+            empty_fp32(5, 4),  # input  # TODO: rms_input
             empty_bf16(4, ),  # weight
             empty_fp32(1, 1)  # scale
         ]
@@ -185,8 +185,8 @@ def replacement(input: torch.Tensor, residual: torch.Tensor,
             return at[1], at[2]
 
         inputs = [
-            # TODO: maybe 32bit for torch impl?
-            # TODO dtype doesn't seem to matter?
+            # TODO: maybe 32bit for torch impl? yes to resolve bug
+            # TODO dtype doesn't seem to matter? it does matter for what cvts get traced
             empty_bf16(5, 4),  # input
             empty_bf16(5, 4),  # residual
             empty_bf16(4, ),  # weight
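The updated TODO in the last hunk notes that the dtype of the trace inputs determines which convert ops get traced into the pattern. A standalone sketch of that effect, not taken from this patch (make_fx is a public torch.fx tracing entry point; rms_like is a made-up stand-in for the matched pattern):

    import torch
    from torch.fx.experimental.proxy_tensor import make_fx

    def rms_like(x):
        # Same cast structure as the matched pattern: upcast to fp32,
        # compute, downcast back to a 16-bit dtype.
        return x.to(torch.float32).pow(2).mean(dim=-1, keepdim=True).to(torch.bfloat16)

    # Tracing with bf16 inputs records an upcast to float32; tracing with
    # fp32 inputs does not (Tensor.to is a no-op when the dtype already
    # matches), so a pattern traced one way won't match graphs built the
    # other way.
    g_bf16 = make_fx(rms_like)(torch.empty(5, 4, dtype=torch.bfloat16))
    g_fp32 = make_fx(rms_like)(torch.empty(5, 4, dtype=torch.float32))
    print(g_bf16.graph)
    print(g_fp32.graph)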

vllm/compilation/matcher_utils.py (7 additions, 3 deletions)

@@ -43,6 +43,10 @@ def __init__(self, epsilon: float, enabled: Optional[bool] = None):
 
         self.forward = self.forward_custom if enabled else self.forward_native
         self.model_dtype = get_current_vllm_config().model_config.dtype
+        print(self.model_dtype)
+
+    def inputs(self):
+        return
 
     def forward_custom(
         self,
@@ -76,10 +80,10 @@ def forward_native(
         weight: torch.Tensor,
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        x = input  # .to(torch.float32)
+        x = input.to(torch.float32)
         if residual is not None:
-            x = x + residual.to(torch.float32)
-            residual = x  # conversion to 16-bit is eliminated in full graph
+            x = x + residual
+            residual = x.to(self.model_dtype)
 
         variance = x.pow(2).mean(dim=-1, keepdim=True)
 
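The hunk is cut off at the variance line. For orientation, a sketch of how the rest of forward_native plausibly continues, following the standard RMSNorm reference formulation (everything after the variance line is an assumption, not shown in this diff; self.epsilon is assumed to be the epsilon stored by __init__):

    def forward_native(
        self,
        input: torch.Tensor,
        weight: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        x = input.to(torch.float32)
        if residual is not None:
            x = x + residual
            residual = x.to(self.model_dtype)

        variance = x.pow(2).mean(dim=-1, keepdim=True)
        # Assumed tail: normalize, apply the weight, downcast to model dtype.
        x = x * torch.rsqrt(variance + self.epsilon)
        x = (x * weight.to(torch.float32)).to(self.model_dtype)
        return x if residual is None else (x, residual)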