23 commits
c0d3784
Added the extra use_irope parameter in
tjtanaa Apr 7, 2025
2c2966c
Fix ROCm V1 Engine Fused MoE Bug
tjtanaa Apr 7, 2025
4d71ebe
Add warning message that V0 does not support irope
tjtanaa Apr 7, 2025
c4fc335
Expose torch.Tag for tensor stride handling
kliuae Apr 8, 2025
f8e76ec
fix linting issue
tjtanaa Apr 8, 2025
f68214d
Merge remote-tracking branch 'origin/main' into llama4-fp8
tjtanaa Apr 8, 2025
1632b77
add initial code path of CompressedTensorsW8A8Fp8MoEAiterMethod
tjtanaa Apr 8, 2025
60e1e19
enable aiter tkw1 kernel on compressed tensors moe
vllmellm Apr 9, 2025
e2104d7
separate out the tkw1 kernel from asm moe
vllmellm Apr 9, 2025
ec9332b
fix spelling typo
vllmellm Apr 9, 2025
645a87b
remove unused code
vllmellm Apr 9, 2025
f5ede3f
add support for aiter moe ops to be registered in v1 graph mode
vllmellm Apr 10, 2025
6ebe26a
aiter asm_moe_tkw1 mem access fault bug fix (WIP)
vllmellm Apr 10, 2025
e9ce59f
linting and update fake tensor function
tjtanaa Apr 10, 2025
3ce36d3
fix the V1 cuda graph mode
tjtanaa Apr 11, 2025
f12b056
clean up
tjtanaa Apr 11, 2025
cdddd61
merge with main
tjtanaa Apr 11, 2025
36c671b
remove kwargs from rocm_aiter wrapper
tjtanaa Apr 13, 2025
1bcf1d7
Merge remote-tracking branch 'origin/main' into llama4-fp8
vllmellm Apr 14, 2025
b98b224
update aiter compressed tensor moe method
vllmellm Apr 14, 2025
3fa80b6
restrict apply_router_weight_on_input
tjtanaa Apr 15, 2025
449bdaf
fix some function signature; fix tkw1 bug
tjtanaa Apr 16, 2025
88977c2
enable tkw1 and ck_moe for V1 Llama4 with torch compile
tjtanaa Apr 17, 2025
9 changes: 9 additions & 0 deletions vllm/envs.py
@@ -78,6 +78,7 @@
VLLM_ROCM_USE_AITER_LINEAR: bool = True
VLLM_ROCM_USE_AITER_MOE: bool = True
VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = False
VLLM_ROCM_USE_AITER_FP8_TKW1_MOE: bool = False
VLLM_ROCM_USE_AITER_RMSNORM: bool = True
VLLM_ROCM_FP8_PADDING: bool = True
VLLM_ROCM_MOE_PADDING: bool = True
@@ -553,6 +554,14 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
(os.getenv("VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE", "false").lower() in
("true", "1")),

# TODO: change this back to False
# Whether to use aiter custom topk weight multiplication first
# channel scaled moe kernel. (This is for Llama-4)
# By default this is disabled.
"VLLM_ROCM_USE_AITER_FP8_TKW1_MOE":
lambda: (os.getenv("VLLM_ROCM_USE_AITER_FP8_TKW1_MOE", "false").lower() in
("true", "1")),

# use aiter rms norm op if aiter ops are enabled.
"VLLM_ROCM_USE_AITER_RMSNORM":
lambda: (os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in
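For reviewers who want to exercise the new flag locally, here is a minimal sketch of how it is resolved, assuming `vllm.envs` keeps its lazy, lambda-per-variable lookup; the variable names are taken from the hunk above, and the snippet is illustrative rather than part of this diff.

```python
# Minimal sketch (not part of this diff): opt in to the AITER tkw1 MoE path.
# Assumes vllm.envs resolves each variable lazily via its lambda on access.
import os

# The new flag defaults to "false"; "1" or "true" enables it.
os.environ["VLLM_ROCM_USE_AITER_FP8_TKW1_MOE"] = "1"

import vllm.envs as envs

# Evaluates the lambda shown in the hunk above against the current environment.
print(envs.VLLM_ROCM_USE_AITER_FP8_TKW1_MOE)  # True once the variable is set
```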
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -25,7 +25,7 @@

from .rocm_aiter_fused_moe import (is_rocm_aiter_moe_enabled,
rocm_aiter_fused_experts,
rocm_aiter_topk_softmax)
rocm_aiter_topk_softmax_wrapper)

logger = init_logger(__name__)

@@ -842,7 +842,7 @@ def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor,

def dispatch_topk_func() -> Callable[..., tuple[torch.Tensor, ...]]:
if is_rocm_aiter_moe_enabled():
return rocm_aiter_topk_softmax
return rocm_aiter_topk_softmax_wrapper
return vllm_topk_softmax


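For context, a condensed sketch of the dispatch pattern this hunk changes, assuming the enablement check and the two callables named in the diff; the function bodies are illustrative stand-ins, not the real kernels.

```python
# Illustrative stand-in for the dispatch pattern in fused_moe.py; only the
# names come from the diff, the bodies here are placeholders.
from typing import Callable

import torch


def is_rocm_aiter_moe_enabled() -> bool:
    # Stand-in for the real check, which gates on ROCm plus the
    # VLLM_ROCM_USE_AITER* environment flags shown in envs.py above.
    return False


def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor,
                      *args) -> tuple[torch.Tensor, ...]:
    # Default top-k softmax op (placeholder body).
    return topk_weights, topk_indices


def rocm_aiter_topk_softmax_wrapper(topk_weights: torch.Tensor,
                                    topk_indices: torch.Tensor,
                                    *args) -> tuple[torch.Tensor, ...]:
    # AITER-backed wrapper; the rename in this PR points the dispatcher here.
    return topk_weights, topk_indices


def dispatch_topk_func() -> Callable[..., tuple[torch.Tensor, ...]]:
    # Select the AITER wrapper only when AITER MoE is enabled;
    # otherwise fall back to the default op.
    if is_rocm_aiter_moe_enabled():
        return rocm_aiter_topk_softmax_wrapper
    return vllm_topk_softmax
```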