@@ -85,7 +85,7 @@ def _moe_problem_size(
         M = a1.size(0)
     else:
         assert a1.dim() == 3
-        # assert a1.size(0) == E, f"{a1.size(0)} == {E}"
+        assert a1.size(0) == E, f"{a1.size(0)} == {E}"
         M = a1.size(1)  # This is max_num_tokens
 
     assert topk_ids.dim() == 2
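This hunk turns a previously commented-out shape check into a hard assert: when the activations arrive in the 3-D batched-experts layout, the leading dimension must equal the number of experts. A minimal sketch of the shape convention being enforced (hypothetical helper, not the vLLM implementation):

```python
# Hypothetical sketch of the shape convention the assert above enforces.
import torch

def moe_problem_size_sketch(a1: torch.Tensor, num_experts: int,
                            topk_ids: torch.Tensor) -> tuple[int, int, int]:
    """Infer (E, M, topk) from the activation and routing tensors."""
    if a1.dim() == 2:
        # Standard format: a1 is (M, K) -- one row per token.
        M = a1.size(0)
    else:
        # Batched-experts format: a1 is (E, max_num_tokens, K), so the
        # leading dimension must match the number of experts.
        assert a1.dim() == 3
        assert a1.size(0) == num_experts, f"{a1.size(0)} == {num_experts}"
        M = a1.size(1)  # max_num_tokens per expert
    assert topk_ids.dim() == 2
    return num_experts, M, topk_ids.size(1)
```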
@@ -536,11 +536,12 @@ def apply(
         global_num_experts: int,
         expert_map: Optional[torch.Tensor],
         a1q_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
         workspace13: torch.Tensor,
         workspace2: torch.Tensor,
         expert_tokens_meta: Optional[ExpertTokensMetadata],
         apply_router_weight_on_input: bool,
-    ):
+    ) -> None:
         """
         This function computes the intermediate result of a Mixture of Experts
         (MoE) layer using two sets of weights, w1 and w2.
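The docstring describes the classic two-GEMM expert computation. As a rough, dense toy of what "two sets of weights, w1 and w2" means here (assuming the common fused gate/up layout and SiLU activation; quantization, scales, and routing are omitted):

```python
# Conceptual sketch of the two-GEMM expert computation the apply() docstring
# describes (dense toy version; not the vLLM kernel).
import torch
import torch.nn.functional as F

E, M, K, N = 4, 16, 32, 64          # experts, tokens, hidden, intermediate
a = torch.randn(E, M, K)            # per-expert activations
w1 = torch.randn(E, 2 * N, K)       # fused gate/up projection
w2 = torch.randn(E, K, N)           # down projection

h = torch.einsum("emk,enk->emn", a, w1)      # first GEMM: (E, M, 2N)
gate, up = h.chunk(2, dim=-1)
h = F.silu(gate) * up                        # SiLU-and-mul activation
out = torch.einsum("emn,ekn->emk", h, w2)    # second GEMM: back to (E, M, K)
```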
@@ -674,22 +675,22 @@ def _allocate_buffers(
 
         # We can reuse the memory between cache1 and cache3 because by the
         # time we need cache3, we're done with cache1.
-        workspace13 = torch.zeros(prod(workspace13_shape),
-                                  device=device,
-                                  dtype=workspace_dtype)
-        workspace2 = torch.zeros(prod(workspace2_shape),
-                                 device=device,
-                                 dtype=workspace_dtype)
+        workspace13 = self.workspace13_buffer.get(workspace13_shape,
+                                                  device=device,
+                                                  dtype=workspace_dtype)
+        workspace2 = self.workspace2_buffer.get(workspace2_shape,
+                                                device=device,
+                                                dtype=workspace_dtype)
 
         # Construct the entire output that can then be processed in chunks.
         if num_chunks == 1 and prod(workspace13_shape) >= prod(
                 fused_out_shape):
             # Reuse workspace13 for the output in the non-chunked case.
             fused_out = _resize_cache(workspace13, fused_out_shape)
         else:
-            fused_out = torch.empty(fused_out_shape,
-                                    device=device,
-                                    dtype=out_dtype)
+            fused_out = self.fused_out_buffer.get(fused_out_shape,
+                                                  device=device,
+                                                  dtype=out_dtype)
 
         return workspace13, workspace2, fused_out
 
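The allocation hunk swaps fresh `torch.zeros`/`torch.empty` calls for persistent buffers (`workspace13_buffer`, `workspace2_buffer`, `fused_out_buffer`) fetched via `.get(shape, device, dtype)`. The buffer class itself is not part of this diff; a minimal sketch of a grow-only cached buffer with that interface might look like:

```python
# Minimal sketch of a reusable workspace buffer with the get() interface the
# hunk assumes; the real vLLM helper may differ.
from math import prod
from typing import Optional
import torch

class CachedBuffer:
    def __init__(self):
        self._buf: Optional[torch.Tensor] = None

    def get(self, shape, device, dtype) -> torch.Tensor:
        numel = prod(shape)
        # Grow (or re-create on device/dtype change); otherwise reuse the
        # existing allocation and view its prefix with the requested shape.
        if (self._buf is None or self._buf.numel() < numel
                or self._buf.device != device or self._buf.dtype != dtype):
            self._buf = torch.empty(numel, device=device, dtype=dtype)
        return self._buf[:numel].view(*shape)
```

Reusing one allocation across forward passes avoids allocator churn. Note that, unlike the old `torch.zeros`, a cache like this hands back uninitialized memory, which is safe only if the kernels fully overwrite the workspaces before reading them.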
@@ -785,7 +786,10 @@ def forward(
         - torch.Tensor: The output tensor after applying the MoE layer.
         """
 
-        output = hidden_states if inplace else torch.zeros_like(hidden_states)
+        if inplace and self.shared_experts is None:
+            output = hidden_states
+        else:
+            output = torch.zeros_like(hidden_states)
 
         local_num_experts = w1.size(0)
         if global_num_experts == -1:
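The output buffer now aliases `hidden_states` only when `inplace` is requested and there are no shared experts. The reason shows up later in this diff: the shared experts are fed `hidden_states` after the fused output has been produced, so writing the MoE result in place would clobber their input. A tiny illustration of the hazard (toy code, not vLLM):

```python
# Tiny runnable demo of the aliasing hazard the hunk guards against
# (illustrative only; the in-place write stands in for finalize()).
import torch

hidden_states = torch.ones(4, 8)
output = hidden_states          # "inplace": output aliases the input
output.mul_(0)                  # finalize() writes the MoE result here
# A shared-experts pass reading hidden_states now sees zeros, not the
# original activations -- hence the extra `shared_experts is None` check.
assert hidden_states.abs().sum() == 0
```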
@@ -799,8 +803,6 @@ def forward(
             (a1q, a1q_scale, expert_tokens_meta, _expert_topk_ids,
              _expert_topk_weights) = self.prepare_finalize.prepare(
                  hidden_states,
-                 a1_scale,
-                 a2_scale,
                  topk_weights,
                  topk_ids,
                  global_num_experts,
@@ -810,10 +812,9 @@ def forward(
             )
         else:
             # Overlap shared expert compute with all2all dispatch.
-            receiver = self.prepare_finalize.prepare_async(
+            dbo_maybe_run_recv_hook()
+            hook, receiver = self.prepare_finalize.prepare_async(
                 hidden_states,
-                a1_scale,
-                a2_scale,
                 topk_weights,
                 topk_ids,
                 global_num_experts,
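`prepare_async` now returns a `(hook, receiver)` pair instead of a bare receiver, and any outstanding dual-batch-overlap receive hook is drained via `dbo_maybe_run_recv_hook()` before the next dispatch launches. A self-contained toy of the split-phase pattern (the names and threading here are illustrative, not vLLM's actual all2all machinery):

```python
# Toy split-phase (hook, receiver) pattern: start the communication,
# overlap other work, then wait and collect. Hypothetical stand-in only.
import queue
import threading

def prepare_async(data):
    q: "queue.Queue[list[int]]" = queue.Queue(maxsize=1)
    t = threading.Thread(target=lambda: q.put([x * 2 for x in data]))
    t.start()
    return t.join, q.get            # hook waits; receiver unpacks

hook, receiver = prepare_async([1, 2, 3])
overlapped = sum(range(1000))       # shared-expert compute would go here
hook()                              # wait for the "dispatch" to land
assert receiver() == [2, 4, 6]
```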
@@ -838,6 +839,8 @@ def forward(
         topk_weights = (topk_weights if _expert_topk_weights is None else
                         _expert_topk_weights)
 
+        fused_out = None
+
         if a1q.numel() == 0:
             # This happens when none of the tokens from the all2all reach this
             # EP rank. Also, note that this is only relevant for CUDAGraph
@@ -853,7 +856,7 @@ def forward(
             CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE
             num_chunks = cdiv(M, CHUNK_SIZE)
         else:
-            CHUNK_SIZE = M #a1q.size(0)
+            CHUNK_SIZE = M  # a1q.size(0)
             num_chunks = 1
 
         def input_chunk_range(chunk_idx: int) -> tuple[int, int]:
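Chunked execution splits the M tokens into `cdiv(M, CHUNK_SIZE)` pieces that pass through the shared workspaces one at a time. A quick illustration of the arithmetic behind `num_chunks` and `input_chunk_range` (only the helper's signature appears in this diff; its body below is assumed from the names):

```python
# Sketch of the chunking arithmetic: ceiling division plus half-open ranges.
def cdiv(a: int, b: int) -> int:
    return -(-a // b)               # ceiling division

M, CHUNK_SIZE = 10, 4
num_chunks = cdiv(M, CHUNK_SIZE)    # 3 chunks: [0,4), [4,8), [8,10)

def input_chunk_range(chunk_idx: int) -> tuple[int, int]:
    s = chunk_idx * CHUNK_SIZE
    e = min(s + CHUNK_SIZE, M)      # the last chunk may be short
    return s, e

assert [input_chunk_range(i) for i in range(num_chunks)] == \
       [(0, 4), (4, 8), (8, 10)]
```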
@@ -892,12 +895,8 @@ def input_chunk_range(chunk_idx: int) -> tuple[int, int]:
                 activation=activation,
                 global_num_experts=global_num_experts,
                 expert_map=expert_map,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                w1_zp=w1_zp,
-                w2_zp=w2_zp,
                 a1q_scale=_chunk_scales(a1q_scale, s, e),
-                a2_scale=_chunk_scales(a2_scale, e, e),
+                a2_scale=_chunk_scales(self.fused_experts.a2_scale, e, e),
                 workspace13=workspace13,
                 workspace2=workspace2,
                 expert_tokens_meta=c_expert_tokens_meta,
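With the weight scales and `a2_scale` removed from the per-call plumbing, each chunk now pulls `a2_scale` from `self.fused_experts` instead. `_chunk_scales` itself is not shown in this diff; a plausible sketch, assuming per-token scales get sliced to the chunk while per-tensor scales pass through unchanged:

```python
# Plausible sketch of a _chunk_scales helper; not the verified vLLM
# implementation.
from typing import Optional
import torch

def chunk_scales_sketch(scales: Optional[torch.Tensor], start: int,
                        end: int) -> Optional[torch.Tensor]:
    if scales is not None and scales.numel() > 1:
        return scales[start:end]    # per-token scales: take the chunk's rows
    return scales                   # None or per-tensor scale: unchanged
```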
@@ -918,7 +917,7 @@ def input_chunk_range(chunk_idx: int) -> tuple[int, int]:
                 self.fused_experts.finalize_weight_and_reduce_impl(),
             )
             if self.shared_experts is not None:
-                shared_output = self.shared_experts(a1)
+                shared_output = self.shared_experts(hidden_states)
         else:
             recv_hook = self.prepare_finalize.finalize_async(
                 output,
@@ -930,7 +929,7 @@ def input_chunk_range(chunk_idx: int) -> tuple[int, int]:
             )
 
             if self.shared_experts is not None:
-                shared_output = self.shared_experts(a1)
+                shared_output = self.shared_experts(hidden_states)
 
             assert recv_hook is not None
             dbo_register_recv_hook(recv_hook)