Commit 876ef22

Fix tests, PR feedback
Signed-off-by: Luka Govedič <[email protected]>
1 parent: 6253d5b

4 files changed: +25 −16 lines

tests/compile/test_fusion.py

Lines changed: 11 additions & 6 deletions

```diff
@@ -169,24 +169,29 @@ def test_fusion_rmsnorm_quant(
     cleanup_pass = PostCleanupPass(vllm_config)
 
     backend = TestBackend(noop_pass, fusion_pass, cleanup_pass)
+    backend2 = TestBackend(noop_pass, cleanup_pass)
     model = TestModel(hidden_size, eps, static, cuda_force_torch)
 
     # First dimension dynamic
     x = torch.rand(num_tokens, hidden_size)
     torch._dynamo.mark_dynamic(x, 0)
 
-    result = model(x)
+    model_fused = torch.compile(model, backend=backend)
+    result_fused = model_fused(x)
 
-    model2 = torch.compile(model, backend=backend)
-    result2 = model2(x)
+    model_unfused = torch.compile(model, backend=backend2)
+    result_unfused = model_unfused(x)
 
-    # Higher tol for dynamic bfloat16
-    if dtype == torch.float16 or static:
+    if enable_rms_norm_custom_op and static:
+        ATOL, RTOL = (1e-5, 1e-5)  # up to 1e-8 close
+    elif dtype == torch.float16:
         ATOL, RTOL = (2e-3, 2e-3)
+    elif static:
+        ATOL, RTOL = (5e-3, 5e-3)
     else:
         ATOL, RTOL = (1e-2, 1e-2)
 
-    torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
+    torch.testing.assert_close(result_fused, result_unfused, atol=ATOL, rtol=RTOL)
 
     assert fusion_pass.matched_count == 3
     backend.check_before_ops(model.ops_in_model_before())
```
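The test previously compared the fused, compiled model against plain eager execution; it now compiles the model twice, once with and once without the fusion pass, so both results go through the same compilation pipeline and only the fusion differs. Below is a minimal, self-contained sketch of that compare-two-backends pattern; the toy model and passthrough backend are stand-ins for vLLM's TestModel and TestBackend.

```python
import torch

# Stand-in for vLLM's TestBackend: the real one applies the noop/fusion/
# cleanup passes to the FX graph; this one returns the graph unchanged.
def passthrough_backend(gm, example_inputs):
    return gm.forward

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU())

x = torch.rand(5, 16)
torch._dynamo.mark_dynamic(x, 0)  # first dimension dynamic, as in the test

# Compile the same eager module under two backends and compare outputs.
model_a = torch.compile(model, backend=passthrough_backend)
model_b = torch.compile(model, backend="eager")

torch.testing.assert_close(model_a(x), model_b(x), atol=1e-5, rtol=1e-5)
```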

tests/compile/test_sequence_parallelism.py

Lines changed: 3 additions & 4 deletions

```diff
@@ -18,6 +18,7 @@
     ModelConfig,
     PassConfig,
     VllmConfig,
+    get_current_vllm_config,
     set_current_vllm_config,
 )
 from vllm.distributed import tensor_model_parallel_all_reduce
@@ -94,13 +95,11 @@ def ops_in_model(self):
 
 
 class TestQuantModel(torch.nn.Module):
-    def __init__(
-        self, hidden_size=16, intermediate_size=32, vllm_config: VllmConfig = None
-    ):
+    def __init__(self, hidden_size=16, intermediate_size=32):
         super().__init__()
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
-        self.vllm_config = vllm_config
+        self.vllm_config = get_current_vllm_config()
         self.gate_proj = torch.nn.Parameter(
             torch.empty((intermediate_size, hidden_size)), requires_grad=False
         )
```
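Instead of threading `vllm_config` through the constructor, `TestQuantModel` now reads it from the ambient context. A sketch of that pattern, assuming the model is constructed inside a `set_current_vllm_config` block (which the surrounding test code provides) and that both helpers live in `vllm.config`:

```python
from vllm.config import VllmConfig, get_current_vllm_config, set_current_vllm_config

vllm_config = VllmConfig()

# set_current_vllm_config is a context manager; code that runs inside it,
# such as a module's __init__, can recover the active config without an
# explicit argument.
with set_current_vllm_config(vllm_config):
    assert get_current_vllm_config() is vllm_config
```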

vllm/compilation/fusion.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -33,7 +33,7 @@
 
 
 def empty_bf16(*args, **kwargs):
-    return torch.empty(*args, **kwargs, dtype=torch.float16, device="cuda")
+    return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device="cuda")
 
 
 def empty_fp32(*args, **kwargs):
@@ -144,7 +144,7 @@ def replacement(input: torch.Tensor, weight: torch.Tensor, scale: torch.Tensor):
         inputs = [
             # input, weight
             *self.rmsnorm_matcher.inputs(),
-            empty_fp32(1, 1),  # scale
+            self.quant_matcher.inputs()[1],  # scale
         ]
         pattern(*inputs)
 
@@ -200,7 +200,7 @@ def replacement(
         inputs = [
             # input, weight, residual
             *self.rmsnorm_matcher.inputs(),
-            empty_fp32(1, 1),  # scale
+            self.quant_matcher.inputs()[1],  # scale
         ]
 
         pm.register_replacement(
```
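The first hunk fixes a copy-paste bug: `empty_bf16` allocated `float16` tensors despite its name. The other two hunks take the scale placeholder from the quant matcher rather than hard-coding a `(1, 1)` fp32 tensor, keeping it consistent with the quantization scheme. A quick sanity check of the corrected helper (a sketch; requires a CUDA device):

```python
import torch

def empty_bf16(*args, **kwargs):
    # After the fix, the dtype matches the function name.
    return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device="cuda")

if torch.cuda.is_available():
    assert empty_bf16(5, 16).dtype == torch.bfloat16
```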

vllm/compilation/matcher_utils.py

Lines changed: 8 additions & 3 deletions

```diff
@@ -112,9 +112,7 @@ def __init__(self, epsilon: float, enabled: bool | None = None):
 
     def inputs(self):
         input = self.empty(5, 16) if self.enabled else self.empty_f32(5, 16)
-        weight = self.empty(
-            16,
-        )
+        weight = self.empty(16)
         residual = self.empty(5, 16)
         return [input, weight, residual]
 
@@ -203,3 +201,10 @@ def make_scale(self, input: torch.Tensor):
         )
 
         return torch.empty(scale_shape, device=input.device, dtype=torch.float32)
+
+    def inputs(self) -> list[torch.Tensor]:
+        input = self.empty(5, 16)
+        if self.quant_key.scale.static:
+            return [input, self.empty_f32(1, 1)]
+
+        return [input]
```
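The new `inputs()` method is what lets `fusion.py` grab the scale via `self.quant_matcher.inputs()[1]`: with static quantization the example inputs include a `(1, 1)` fp32 scale, while with dynamic quantization the scale is computed inside the graph, so only the activation is returned. A standalone sketch of that branching; the function name and plain tensors are stand-ins for the vLLM matcher class:

```python
import torch

def example_inputs(static_scale: bool) -> list[torch.Tensor]:
    # Activation placeholder, matching the (5, 16) shape the matchers use.
    input = torch.empty(5, 16, dtype=torch.bfloat16)
    if static_scale:
        # Static quantization: the scale is a graph input, so supply one.
        return [input, torch.empty(1, 1, dtype=torch.float32)]
    # Dynamic quantization: the scale is derived from the input at runtime.
    return [input]

assert len(example_inputs(static_scale=True)) == 2   # [input, scale]
assert len(example_inputs(static_scale=False)) == 1  # [input]
```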
