@@ -54,8 +54,7 @@ def forward(self, x):
         return y
 
     def example_inputs(self, num_tokens=32, hidden_size=128):
-        dtype = torch.float16 if TEST_FP8 else torch.float32
-        return (torch.rand(num_tokens, hidden_size * 2, dtype=dtype),)
+        return (torch.rand(num_tokens, hidden_size * 2),)
 
     def ops_in_model(self, do_fusion):
         if TEST_FP8 and do_fusion:
@@ -73,15 +72,11 @@ def __init__(self, hidden_size=16, intermediate_size=32):
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
 
-        dtype = torch.float16 if TEST_FP8 else torch.float32
-
         self.gate_proj = torch.nn.Parameter(
-            torch.empty((intermediate_size, hidden_size), dtype=dtype)
+            torch.empty((intermediate_size, hidden_size))
         )
         self.norm = RMSNorm(intermediate_size, 1e-05)
-        self.norm.weight = torch.nn.Parameter(
-            torch.ones(intermediate_size, dtype=dtype)
-        )
+        self.norm.weight = torch.nn.Parameter(torch.ones(intermediate_size))
 
         torch.nn.init.normal_(self.gate_proj, std=0.02)
 
@@ -118,9 +113,8 @@ def forward(self, hidden_states, residual):
         return norm_output, residual_output
 
     def example_inputs(self, batch_size=8, hidden_size=16, seq_len=16):
-        dtype = torch.float16 if TEST_FP8 else torch.float32
-        hidden_states = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
-        residual = torch.randn((batch_size * seq_len, hidden_size), dtype=dtype)
+        hidden_states = torch.randn((batch_size * seq_len, hidden_size))
+        residual = torch.randn((batch_size * seq_len, hidden_size))
         return (hidden_states, residual)
 
     def ops_in_model(self, do_fusion):
@@ -151,10 +145,9 @@ def forward(self, positions, q, k):
         return q_rotated, k_rotated
 
     def example_inputs(self, num_tokens=32, head_dim=64):
-        dtype = torch.float16
         positions = torch.arange(num_tokens, dtype=torch.long)
-        q = torch.randn(num_tokens, head_dim, dtype=dtype)
-        k = torch.randn(num_tokens, head_dim, dtype=dtype)
+        q = torch.randn(num_tokens, head_dim)
+        k = torch.randn(num_tokens, head_dim)
         return (positions, q, k)
 
     def ops_in_model(self, do_fusion):
@@ -172,7 +165,7 @@ def __init__(self, head_dim=64, num_heads=4, max_position=2048, base=10000):
         self.hidden_size = head_dim * num_heads
 
         self.qkv_proj = torch.nn.Linear(
-            self.hidden_size, self.hidden_size * 3, bias=False, dtype=torch.float16
+            self.hidden_size, self.hidden_size * 3, bias=False
         )
 
         self.rotary_emb = get_rope(
@@ -196,10 +189,9 @@ def forward(self, positions, hidden_states):
         return qkv_updated
 
     def example_inputs(self, num_tokens=32, head_dim=64, num_heads=4):
-        dtype = torch.float16
         hidden_size = head_dim * num_heads
         positions = torch.arange(num_tokens, dtype=torch.long)
-        hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
+        hidden_states = torch.randn(num_tokens, hidden_size)
         return (positions, hidden_states)
 
     def ops_in_model(self, do_fusion):
@@ -217,14 +209,18 @@ def ops_not_in_model(self):
 ]
 
 
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("model_class", MODELS)
 @pytest.mark.parametrize("do_fusion", [True, False])
 @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", reason="Only test on CUDA")
-def test_fix_functionalization(model_class: torch.nn.Module, do_fusion: bool):
+def test_fix_functionalization(
+    model_class: torch.nn.Module, do_fusion: bool, dtype: torch.dtype
+):
     torch.set_default_device("cuda")
+    torch.set_default_dtype(dtype)
 
     vllm_config = VllmConfig(
-        model_config=ModelConfig(dtype=torch.bfloat16),
+        model_config=ModelConfig(dtype=dtype),
         compilation_config=CompilationConfig(
             custom_ops=["all"],
             pass_config=PassConfig(enable_fusion=do_fusion, enable_noop=True),
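For reference, a minimal standalone sketch (test name, shapes, and dtypes here are illustrative assumptions, not part of this file) of the behavior the parametrized test relies on: after torch.set_default_dtype(dtype), floating-point factories such as torch.rand/torch.randn and freshly created module parameters inherit that dtype, which is why the example_inputs helpers above can drop their explicit dtype= arguments.

import pytest
import torch


@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_default_dtype_controls_factories(dtype: torch.dtype):
    # Setting the global default dtype makes float factories inherit it.
    torch.set_default_dtype(dtype)
    try:
        x = torch.randn(4, 8)                         # no dtype= needed
        w = torch.nn.Linear(8, 8, bias=False).weight  # parameters follow suit
        assert x.dtype == dtype
        assert w.dtype == dtype
    finally:
        torch.set_default_dtype(torch.float32)        # restore for other tests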