
Commit 0bb53d7

Support FusedMoE LoRA Triton kernel for mxfp4 model
Signed-off-by: Xin Yang <[email protected]>
1 parent 67745d1 commit 0bb53d7

File tree

8 files changed: +99 -43 lines changed


vllm/lora/layers/fused_moe.py

Lines changed: 19 additions & 6 deletions
@@ -28,6 +28,9 @@
 from vllm.model_executor.layers.fused_moe.fused_moe_modular_method import (
     FusedMoEModularMethod,
 )
+from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
+    modular_oai_triton_fused_moe,
+)


 class FusedMoEWithLoRA(BaseLayerWithLoRA):
@@ -108,15 +111,23 @@ def _inject_lora_into_fused_moe(self):
         self.base_layer.ensure_moe_quant_config_init()
         quant_config = self.base_layer.quant_method.moe_quant_config

-        m_fused_moe_fn = (
-            modular_triton_fused_moe(
-                quant_config, shared_experts=self.base_layer.shared_experts
+        if quant_config.use_mxfp4_w4a16:
+            from vllm.model_executor.layers.quantization.mxfp4 import Mxfp4Backend
+
+            mxfp4_backend = self.base_layer.quant_method.mxfp4_backend
+            m_fused_moe_fn = (
+                modular_oai_triton_fused_moe(
+                    quant_config, shared_experts=self.base_layer.shared_experts
+                )
+                if mxfp4_backend == Mxfp4Backend.TRITON
+                else modular_marlin_fused_moe(
+                    quant_config, shared_experts=self.base_layer.shared_experts
+                )
             )
-            if not quant_config.use_mxfp4_w4a16
-            else modular_marlin_fused_moe(
+        else:
+            m_fused_moe_fn = modular_triton_fused_moe(
                 quant_config, shared_experts=self.base_layer.shared_experts
             )
-        )

         def fwd_decorator(layer, func):
             def wrapper(*args, **kwargs):
@@ -279,9 +290,11 @@ def wrapper(*args, **kwargs):
         fused_experts.activation = act_decorator(
             self.base_layer, fused_experts.activation
         )
+        fused_experts.fuse_act = False
         fused_experts.moe_sum = moe_sum_decorator(
             self.base_layer, fused_experts.moe_sum
         )
+        fused_experts.fuse_sum = False

         self.base_layer.quant_method = FusedMoEModularMethod(
             self.base_layer.quant_method, m_fused_moe_fn
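For readers skimming the diff, the change above reduces to a three-way dispatch over the modular fused-MoE kernels. Below is a minimal sketch of that branching with the vLLM objects replaced by hypothetical stand-ins (the *_stub names are illustrative and not part of the commit):

from enum import Enum


class Mxfp4BackendStub(Enum):
    # Stand-in for vllm.model_executor.layers.quantization.mxfp4.Mxfp4Backend;
    # only the two members relevant to this branch are modeled.
    MARLIN = 1
    TRITON = 2


def select_fused_moe_kernel_stub(use_mxfp4_w4a16: bool, mxfp4_backend) -> str:
    # mxfp4 weights: use the OAI Triton modular kernel when the Triton backend
    # was selected, otherwise fall back to the Marlin modular kernel.
    if use_mxfp4_w4a16:
        if mxfp4_backend == Mxfp4BackendStub.TRITON:
            return "modular_oai_triton_fused_moe"
        return "modular_marlin_fused_moe"
    # Everything else keeps the existing Triton fused-MoE path.
    return "modular_triton_fused_moe"


print(select_fused_moe_kernel_stub(True, Mxfp4BackendStub.TRITON))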

vllm/lora/ops/triton_ops/fused_moe_lora_op.py

Lines changed: 1 addition & 1 deletion
@@ -430,7 +430,7 @@ def _fused_moe_lora(
         == expert_ids.shape[0]
         == num_tokens_post_padded.shape[0]
     )
-    assert len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1]
+    assert output.shape[-1] // lora_b_stacked[0].shape[-2] == len(lora_b_stacked)
     assert output.shape[0] == topk_weights.shape[0]
     assert top_k_num == topk_weights.shape[1]
     device = qcurr_hidden_states.device
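The relaxed assertion appears to matter when the output buffer is wider than the concatenated LoRA-B slices, presumably because of padding such as an mxfp4 hidden-size round-up. The numbers below are made up purely to illustrate the difference between the two checks:

import torch

# Two LoRA-B slices of width 1440 each; the output buffer is padded to 2944
# columns (hypothetical sizes, only the last two dimensions matter here).
lora_b_stacked = [torch.empty(1, 8, 1440, 16) for _ in range(2)]
output = torch.empty(4, 2, 2944)

# Old check: output width must equal num_slices * slice_width -> fails on padding.
print(len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1])   # False

# New check: the slice width only has to divide the (possibly padded) output
# width into num_slices chunks -> passes.
print(output.shape[-1] // lora_b_stacked[0].shape[-2] == len(lora_b_stacked))  # True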

vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py

Lines changed: 72 additions & 11 deletions
@@ -1,17 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from collections.abc import Callable
+
 import torch

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import (
     FUSED_MOE_UNQUANTIZED_CONFIG,
     FusedMoEQuantConfig,
 )
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceNoOP,
 )
+from vllm.model_executor.layers.fused_moe.utils import (
+    _resize_cache,
+)
 from vllm.triton_utils import tl, triton
 from vllm.utils.import_utils import has_triton_kernels

@@ -96,6 +105,7 @@ def triton_kernel_moe_forward(
         routing_data,
         gather_idx,
         scatter_idx,
+        topk=topk,
         activation=activation,
         quant_config=quant_config,
         apply_router_weight_on_input=apply_router_weight_on_input,
@@ -113,14 +123,21 @@ def triton_kernel_fused_experts(
     routing_data,  # RoutingData
     gather_indx,  # GatherIndx
     scatter_indx,  # ScatterIndx
+    topk: int,
     activation: str = "silu",
+    activation_func: Callable[[str, torch.Tensor, torch.Tensor], None] = None,
+    moe_sum: Callable[[torch.Tensor, torch.Tensor], None] | None = None,
     quant_config: FusedMoEQuantConfig | None = None,
     swiglu_alpha: float = 1.702,
     swiglu_limit: float = 7.0,
     apply_router_weight_on_input: bool = False,
     global_num_experts: int = -1,
     expert_map: torch.Tensor | None = None,
+    intermediate_cache13: torch.Tensor | None = None,
+    intermediate_cache2: torch.Tensor | None = None,
     a1q_scale: torch.Tensor | None = None,
+    fuse_act: bool = True,
+    fuse_sum: bool = True,
 ) -> torch.Tensor:
     if quant_config is None:
         quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
@@ -134,16 +151,20 @@
     assert hidden_states.shape[-1] == w1.shape[-2]
     assert w2.shape[-1] == w1.shape[1]

+    M, K = hidden_states.shape
     E, _, N = w1.shape

     if global_num_experts == -1:
         global_num_experts = E

-    act = FusedActivation(
-        FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
-        (swiglu_alpha, swiglu_limit),
-        2,
-    )
+    if not fuse_act:
+        act = None
+    else:
+        act = FusedActivation(
+            FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")),
+            (swiglu_alpha, swiglu_limit),
+            2,
+        )
     gammas = routing_data.gate_scal if routing_data else None

     intermediate_cache1 = matmul_ogs(
@@ -157,16 +178,35 @@
         fused_activation=act,
     )

+    if not fuse_act:
+        intermediate_cache2 = _resize_cache(intermediate_cache2, (M * topk, N // 2))
+        activation_func(
+            activation, intermediate_cache2, intermediate_cache1.view(-1, N)
+        )
+    else:
+        intermediate_cache2 = intermediate_cache1
+
+    n_expts_act = routing_data.n_expts_act
+    if not fuse_sum:
+        routing_data.n_expts_act = 1
+
     intermediate_cache3 = matmul_ogs(
-        intermediate_cache1,
+        intermediate_cache2,
         w2,
         quant_config.w2_bias,
         routing_data,
         scatter_indx=scatter_indx,
         precision_config=quant_config.w2_precision,
         gammas=None if apply_router_weight_on_input else gammas,
-        y=output_tensor,
     )
+
+    if not fuse_sum:
+        moe_sum(intermediate_cache3.view(-1, topk, K), output_tensor)
+
+        # Set the original n_expts_act back
+        routing_data.n_expts_act = n_expts_act
+        return output_tensor
+
     return intermediate_cache3


@@ -239,6 +279,8 @@ def __init__(self, quant_config: FusedMoEQuantConfig):
         # TODO (varun) : Enable activation quantization
         assert quant_config.use_mxfp4_w4a16, "Supports only mxfp4_w4a16"
         super().__init__(quant_config)
+        self.fuse_act = True
+        self.fuse_sum = True

     @property
     def activation_formats(
@@ -263,7 +305,7 @@ def workspace_shapes(
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         # workspace are allocated inside the kernel
-        workspace1 = (M, K)
+        workspace1 = (M, topk, max(N // 2, K))
         workspace2 = (0, 0)
         output = (M, K)
         return (workspace1, workspace2, output)
@@ -297,20 +339,39 @@ def apply(
             topk_ids, topk_weights, local_num_experts
         )

-        experts_output = triton_kernel_fused_experts(
-            None,
+        topk = topk_ids.size(1)
+        triton_kernel_fused_experts(
+            output,
             hidden_states,
             w1,
             w2,
             routing_data,
             gather_indx,
             scatter_indx,
+            topk=topk,
             activation=activation,
+            activation_func=self.activation,
+            moe_sum=self.moe_sum,
             quant_config=self.quant_config,
             apply_router_weight_on_input=False,
             global_num_experts=local_num_experts,
             expert_map=None,  # applied already
+            intermediate_cache13=workspace2,
+            intermediate_cache2=workspace13,
             a1q_scale=a1q_scale,
+            fuse_act=self.fuse_act,
+            fuse_sum=self.fuse_sum,
         )

-        output.copy_(experts_output, non_blocking=True)
+    def moe_sum(self, input: torch.Tensor, output: torch.Tensor):
+        ops.moe_sum(input, output)
+
+
+def modular_oai_triton_fused_moe(
+    quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None
+) -> mk.FusedMoEModularKernel:
+    return mk.FusedMoEModularKernel(
+        MoEPrepareAndFinalizeNoEP(),
+        OAITritonExperts(quant_config),
+        shared_experts,
+    )
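When fuse_act and fuse_sum are disabled (the LoRA path), the activation and the top-k reduction run outside matmul_ogs, so the LoRA decorators can hook in between. The shape bookkeeping is easy to lose in the diff; the sketch below retraces it with plain torch stand-ins. All sizes and the SwiGLU approximation are assumptions for illustration, not the actual Triton kernels:

import torch

# Shape walk-through of the unfused epilogue (fuse_act=False, fuse_sum=False).
M, topk, K, N = 4, 2, 8, 16  # tokens, experts per token, hidden size, gate+up width

# First matmul_ogs with no fused activation: one row per (token, expert) pair,
# full gate+up width N.
intermediate_cache1 = torch.randn(M * topk, N)

# External activation halves the width to N // 2 (approximated here with
# silu(gate) * up purely for the shape check).
gate, up = intermediate_cache1.chunk(2, dim=-1)
intermediate_cache2 = torch.nn.functional.silu(gate) * up        # (M*topk, N//2)

# Second matmul_ogs (down projection) maps back to the hidden size K.
w2 = torch.randn(N // 2, K)
intermediate_cache3 = intermediate_cache2 @ w2                   # (M*topk, K)

# With fuse_sum disabled, the top-k contributions are reduced explicitly,
# mirroring moe_sum(intermediate_cache3.view(-1, topk, K), output_tensor).
output_tensor = intermediate_cache3.view(M, topk, K).sum(dim=1)  # (M, K)
print(output_tensor.shape)  # torch.Size([4, 8])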

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 1 addition & 3 deletions
@@ -218,7 +218,6 @@ def maybe_roundup_hidden_size(
     act_dtype: torch.dtype,
     quant_config: QuantizationConfig | None,
     moe_parallel_config: FusedMoEParallelConfig,
-    is_lora_enabled: bool,
 ) -> int:
     """
     Given layer hidden size and MoE configurations, round up hidden_size
@@ -252,7 +251,7 @@
         get_mxfp4_backend,
     )

-    current_mxfp4_backend = get_mxfp4_backend(is_lora_enabled)
+    current_mxfp4_backend = get_mxfp4_backend()
     if (
         current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
         or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
@@ -386,7 +385,6 @@ def __init__(
             moe_in_dtype,
             quant_config,
             self.moe_parallel_config,
-            is_lora_enabled=self.vllm_config.lora_config is not None,
         )

         # For smuggling this layer into the fused moe custom op

vllm/model_executor/layers/fused_moe/modular_kernel.py

Lines changed: 1 addition & 1 deletion
@@ -1087,7 +1087,7 @@ def _finalize(
         if not self.prepare_finalize.supports_async():
             assert not dbo_enabled()

-            self.prepare_finalize.finalize(
+            output = self.prepare_finalize.finalize(
                 output,
                 fused_out,
                 topk_weights,

vllm/model_executor/layers/fused_moe/prepare_finalize.py

Lines changed: 2 additions & 2 deletions
@@ -65,10 +65,10 @@ def finalize(
         topk_ids: torch.Tensor,
         apply_router_weight_on_input: bool,
         weight_and_reduce_impl: mk.TopKWeightAndReduce,
-    ) -> None:
+    ) -> torch.Tensor:
         if isinstance(weight_and_reduce_impl, TopKWeightAndReduceDelegate):
             weight_and_reduce_impl = TopKWeightAndReduceContiguous()
-        weight_and_reduce_impl.apply(
+        return weight_and_reduce_impl.apply(
             output=output,
             fused_expert_output=fused_expert_output,
             topk_weights=topk_weights,

vllm/model_executor/layers/fused_moe/topk_weight_and_reduce.py

Lines changed: 1 addition & 2 deletions
@@ -68,8 +68,7 @@ def apply(
             f"But got output={output.size()}, "
             f"used_expert_output={fused_expert_output.size()}"
         )
-        output.copy_(fused_expert_output, non_blocking=True)
-        return output
+        return fused_expert_output


 class TopKWeightAndReduceContiguous(mk.TopKWeightAndReduce):
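Taken together, the changes in modular_kernel.py, prepare_finalize.py, and topk_weight_and_reduce.py switch the finalize step from copy-in-place to pass-through: the no-op weight-and-reduce returns the fused expert output, finalize propagates that return value, and the modular kernel rebinds its output to it, presumably skipping a redundant device copy when the experts have already written into the output workspace. A simplified sketch of the before/after contract, using stand-in functions rather than the vLLM classes:

import torch

def finalize_old(output: torch.Tensor, fused_expert_output: torch.Tensor) -> None:
    # Previous behaviour: always an extra device copy, even when
    # fused_expert_output is the tensor the kernel wrote in place.
    output.copy_(fused_expert_output, non_blocking=True)

def finalize_new(output: torch.Tensor, fused_expert_output: torch.Tensor) -> torch.Tensor:
    # New behaviour: hand the fused output back and let the caller rebind
    # `output = prepare_finalize.finalize(...)`, skipping the copy.
    return fused_expert_output

buf = torch.zeros(4, 8)
fused = buf  # e.g. the Triton path wrote directly into the output workspace
out = finalize_new(buf, fused)
assert out is buf  # no copy needed when the buffers already alias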

vllm/model_executor/layers/quantization/mxfp4.py

Lines changed: 2 additions & 17 deletions
@@ -74,24 +74,9 @@ class Mxfp4Backend(Enum):
     TRITON = 6


-def get_mxfp4_backend_with_lora() -> Mxfp4Backend:
-    """
-    Not all MXFP4 backends support LoRA. Select backends that are known to
-    have LoRA support.
-    """
-    if not current_platform.is_cuda():
-        return Mxfp4Backend.NONE
-
-    logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend")
-    return Mxfp4Backend.MARLIN
-
-
-def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
+def get_mxfp4_backend() -> Mxfp4Backend:
     # Backend Selection

-    if with_lora_support:
-        return get_mxfp4_backend_with_lora()
-
     if current_platform.is_cuda():
         if (
             current_platform.is_device_capability(90)
@@ -215,7 +200,7 @@ def get_quant_method(
 class Mxfp4MoEMethod(FusedMoEMethodBase):
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
-        self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled)
+        self.mxfp4_backend = get_mxfp4_backend()
         self.use_marlin = self.mxfp4_backend == Mxfp4Backend.MARLIN
         self.max_capture_size = (
             get_current_vllm_config().compilation_config.max_cudagraph_capture_size
