From 5267f4f9ed34d85a07e6704e08b36144fa9e77ec Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg
Date: Fri, 7 Mar 2025 00:12:17 +0000
Subject: [PATCH 1/3] Fixing the shape to use in padding calculation

---
 tests/kernels/test_moe.py                 | 34 +++++++++----------
 .../layers/fused_moe/fused_moe.py         |  2 +-
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index afb8b9f426a2..f42f20de2c37 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -54,14 +54,6 @@ def test_fused_moe(
     w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
 
     score = torch.randn((m, e), device="cuda", dtype=dtype)
-
-    # Pad the input if use padding
-    if envs.VLLM_MOE_PADDING:
-        w1 = F.pad(w1, (0, 128), "constant", 0)
-        torch.cuda.empty_cache()
-        w2 = F.pad(w2, (0, 128), "constant", 0)
-        torch.cuda.empty_cache()
-
     if ep_size > 1:
         local_e = e // ep_size
         e_ids = torch.randint(0,
@@ -75,16 +67,7 @@
     else:
         e_map = None
 
-    triton_output = fused_moe(a,
-                              w1,
-                              w2,
-                              score,
-                              topk,
-                              global_num_experts=e,
-                              expert_map=e_map,
-                              renormalize=False)
     torch_output = torch_moe(a, w1, w2, score, topk, e_map)
-    torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
     iterative_output = iterative_moe(a,
                                      w1,
                                      w2,
@@ -93,6 +76,23 @@
                                      global_num_experts=e,
                                      expert_map=e_map,
                                      renormalize=False)
+    # Pad the input if use padding
+    if envs.VLLM_MOE_PADDING:
+        w1 = F.pad(w1, (0, 128), "constant", 0)
+        torch.cuda.empty_cache()
+        w2 = F.pad(w2, (0, 128), "constant", 0)
+        torch.cuda.empty_cache()
+
+    triton_output = fused_moe(a,
+                              w1,
+                              w2,
+                              score,
+                              topk,
+                              global_num_experts=e,
+                              expert_map=e_map,
+                              renormalize=False)
+
+    torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
     torch.testing.assert_close(iterative_output,
                                torch_output,
                                atol=1e-2,
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 37be0fd9227f..7806a8de5e56 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -770,7 +770,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
             expert_ids,
             num_tokens_post_padded,
             B.shape[1],
-            A.shape[1] - padding_size,
+            B.shape[2] - padding_size,
             EM,
             topk_ids.numel(),
             A.stride(0),

From 8cd476333f3bbc4410b170c2efc6bbc8896cf000 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg
Date: Fri, 7 Mar 2025 16:22:25 +0000
Subject: [PATCH 2/3] Assertion on the int8 quantized moe

---
 vllm/model_executor/layers/fused_moe/fused_moe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 7806a8de5e56..7e3d05509bb5 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -719,6 +719,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
             block_shape is not None and block_shape[1] > 0:
         assert B_scale is not None and B_scale.ndim == 3
         assert B_zp is None or B_zp.ndim == 3
+        assert padding_size == 0, "MoE padding is not supported " \
+            "with GPTQ/AWQ quantization"
 
         fused_moe_kernel_gptq_awq[grid](
             A,

From 15bd5664ac5dd7ff2330b5688fd84d8e4380774a Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg
Date: Fri, 7 Mar 2025 17:33:42 +0000
Subject: [PATCH 3/3] Properly testing for padding

---
 tests/kernels/test_moe.py | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py
index f42f20de2c37..87989354db6a 100644
--- a/tests/kernels/test_moe.py
+++ b/tests/kernels/test_moe.py
@@ -3,6 +3,8 @@
 Run `pytest tests/kernels/test_moe.py`.
 """
+import unittest.mock as mock
+
 import pytest
 import torch
 from torch.nn import Parameter
 
@@ -40,6 +42,7 @@
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("ep_size", EP_SIZE)
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("padding", [True, False])
 def test_fused_moe(
     m: int,
     n: int,
@@ -48,7 +51,15 @@ def test_fused_moe(
     topk: int,
     ep_size: int,
     dtype: torch.dtype,
+    padding: bool,
 ):
+    if padding:
+        padding_size = 128
+        envs.VLLM_MOE_PADDING = True
+    else:
+        padding_size = 0
+        envs.VLLM_MOE_PADDING = False
+
     a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
     w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
     w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
@@ -83,14 +94,17 @@
         w2 = F.pad(w2, (0, 128), "constant", 0)
         torch.cuda.empty_cache()
 
-    triton_output = fused_moe(a,
-                              w1,
-                              w2,
-                              score,
-                              topk,
-                              global_num_experts=e,
-                              expert_map=e_map,
-                              renormalize=False)
+    with mock.patch(
+            'vllm.model_executor.layers.fused_moe.fused_moe.padding_size',
+            padding_size):
+        triton_output = fused_moe(a,
+                                  w1,
+                                  w2,
+                                  score,
+                                  topk,
+                                  global_num_experts=e,
+                                  expert_map=e_map,
+                                  renormalize=False)
 
     torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
     torch.testing.assert_close(iterative_output,
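
A note on the shape fix in PATCH 1/3: with VLLM_MOE_PADDING enabled, each
expert weight tensor gets padding_size (128) zero columns appended to its
last dimension, while the activations A are never padded. The reduction
dimension K passed to the kernel therefore has to be recovered from the
padded weight B (B.shape[2] - padding_size); the old A.shape[1] - padding_size
under-counts K by 128 whenever padding is on. A minimal standalone sketch of
the shape arithmetic (toy sizes, not vLLM code):

# Toy illustration of the K-dimension fix; shapes mirror the test above.
import torch
import torch.nn.functional as F

padding_size = 128
e, n, k = 4, 1024, 512             # experts, intermediate size, hidden size

a = torch.randn(16, k)             # activations: never padded
w1 = torch.randn(e, 2 * n, k)      # expert weights, laid out (E, N, K)
w1 = F.pad(w1, (0, padding_size))  # pad last dim -> (E, N, K + 128)

# Correct: recover K from the padded weight, as the patch does.
assert w1.shape[2] - padding_size == k
# Wrong: the activation width already equals K, so subtracting again
# under-counts the reduction dimension by padding_size.
assert a.shape[1] - padding_size == k - padding_size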
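
A note on the test change in PATCH 3/3: padding_size in fused_moe.py is a
module-level attribute (presumably initialized from VLLM_MOE_PADDING once at
import), so assigning envs.VLLM_MOE_PADDING inside the test body alone would
not reach the kernel; the test instead patches the module attribute for the
duration of the fused_moe call. A standalone sketch of the same mock.patch
pattern against a toy module (toy_fused_moe and effective_k are made-up
stand-ins, not vLLM names):

# Toy demonstration of patching a module-level constant read at call time.
import sys
import types
import unittest.mock as mock

# Stand-in for a module whose functions read a module-level constant.
toy = types.ModuleType("toy_fused_moe")
toy.padding_size = 0
toy.effective_k = lambda padded_k: padded_k - toy.padding_size
sys.modules["toy_fused_moe"] = toy  # make it importable for mock.patch

assert toy.effective_k(512 + 128) == 640      # unpatched: no padding applied
with mock.patch("toy_fused_moe.padding_size", 128):
    assert toy.effective_k(512 + 128) == 512  # patched for the call under test
assert toy.padding_size == 0                  # restored when the context exits

Patching the constant keeps the kernel's view of padding_size consistent with
the weights the test actually padded, for both parametrized cases.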