
Commit 6659b99
Add aiter tkw1 kernel for fp8
Signed-off-by: kliuae <[email protected]>
1 parent: fdcb850

File tree: 4 files changed, +184 −2 lines


docker/Dockerfile.rocm_base

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="8970b25b"
+ARG AITER_BRANCH="5a77249"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base

vllm/envs.py

Lines changed: 8 additions & 0 deletions

@@ -78,6 +78,7 @@
     VLLM_ROCM_USE_AITER_LINEAR: bool = True
     VLLM_ROCM_USE_AITER_MOE: bool = True
     VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = False
+    VLLM_ROCM_USE_AITER_FP8_CHANNEL_SCALED_MOE: bool = False
     VLLM_ROCM_USE_AITER_RMSNORM: bool = True
     VLLM_ROCM_FP8_PADDING: bool = True
     VLLM_ROCM_MOE_PADDING: bool = True

@@ -553,6 +554,13 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     (os.getenv("VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE", "false").lower() in
      ("true", "1")),

+    # Whether to use aiter channel scaled moe kernel.
+    # By default this is disabled.
+    "VLLM_ROCM_USE_AITER_FP8_CHANNEL_SCALED_MOE":
+    lambda:
+    (os.getenv("VLLM_ROCM_USE_AITER_FP8_CHANNEL_SCALED_MOE", "false").lower() in
+     ("true", "1")),
+
     # use aiter rms norm op if aiter ops are enabled.
     "VLLM_ROCM_USE_AITER_RMSNORM":
     lambda: (os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in
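For reference, a minimal sketch (not part of the commit) of enabling the new path from Python before vLLM builds its MoE layers. The new flag name comes from the diff above; the assumption that the general aiter switches must also be on follows from is_rocm_aiter_channel_scaled_moe_enabled() checking is_rocm_aiter_moe_enabled() first (see the next file).

    # Hedged sketch: the flags are read lazily via os.getenv, so set them
    # before the model is constructed.  The first two are pre-existing aiter
    # switches in vLLM and are not touched by this commit.
    import os

    os.environ["VLLM_ROCM_USE_AITER"] = "1"
    os.environ["VLLM_ROCM_USE_AITER_MOE"] = "1"
    os.environ["VLLM_ROCM_USE_AITER_FP8_CHANNEL_SCALED_MOE"] = "1"  # new flag, off by default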

vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py

Lines changed: 106 additions & 1 deletion

@@ -18,6 +18,50 @@ def is_rocm_aiter_block_scaled_moe_enabled() -> bool:
         envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE


+def is_rocm_aiter_channel_scaled_moe_enabled() -> bool:
+    return is_rocm_aiter_moe_enabled() and \
+        envs.VLLM_ROCM_USE_AITER_FP8_CHANNEL_SCALED_MOE
+
+
+def asm_moe_tkw1_impl(sorted_ids: torch.Tensor,
+                      sorted_weights: torch.Tensor,
+                      sorted_expert_ids: torch.Tensor,
+                      num_valid_ids: torch.Tensor,
+                      moe_buf: torch.Tensor,
+                      hidden_states: torch.Tensor,
+                      w1: torch.Tensor,
+                      w2: torch.Tensor,
+                      topk_weight: torch.Tensor,
+                      topk_ids: torch.Tensor,
+                      fc1_scale: Optional[torch.Tensor] = None,
+                      fc2_scale: Optional[torch.Tensor] = None,
+                      fc1_smooth_scale: Optional[torch.Tensor] = None,
+                      fc2_smooth_scale: Optional[torch.Tensor] = None,
+                      activation_str: str = "silu") -> None:
+    import aiter as rocm_aiter
+
+    if activation_str == "silu":
+        activation = rocm_aiter.ActivationType.Silu
+    elif activation_str == "gelu":
+        activation = rocm_aiter.ActivationType.Gelu
+    else:
+        activation = rocm_aiter.ActivationType.Silu
+
+    E, model_dim, _ = w2.shape
+    M, topk = topk_ids.shape
+    device = topk_ids.device
+
+    a8_type = (w1.dtype if w1.dtype != torch.int32 and w1.dtype != torch.uint32
+               else torch.float8_e4m3fnuz)
+    a8 = torch.empty((M, model_dim), dtype=a8_type, device=device)
+    a8_scale = torch.empty(M, dtype=torch.float, device=device)
+    rocm_aiter.dynamic_per_token_scaled_fp8_quant(a8, hidden_states, a8_scale)
+    fmoe_func = rocm_aiter.fmoe_g1u1_tkw1
+    fmoe_func(moe_buf, a8, w1, w2, sorted_ids, sorted_weights,
+              sorted_expert_ids, num_valid_ids, topk, a8_scale, fc1_scale,
+              fc2_scale, fc2_smooth_scale, activation)
+
+
 def rocm_aiter_fused_experts(
         *,
         hidden_states: torch.Tensor,
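The new asm_moe_tkw1_impl above quantizes the activations to fp8 with one dynamic scale per token before calling the aiter tkw1 kernel. As a rough plain-torch illustration of what that per-token scaled quantization produces (this is an assumption about the aiter op's semantics, and it uses float8_e4m3fn for portability rather than the float8_e4m3fnuz variant targeted on ROCm):

    # Illustrative sketch only, not the aiter kernel: one scale per row is
    # derived from the row's absolute maximum, then the scaled activations are
    # stored in an fp8 tensor, matching the (a8, a8_scale) pair built above.
    import torch

    def per_token_fp8_quant(x: torch.Tensor):
        fp8_max = torch.finfo(torch.float8_e4m3fn).max
        scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / fp8_max
        a8 = (x / scale).to(torch.float8_e4m3fn)      # quantized activations
        a8_scale = scale.squeeze(-1).to(torch.float)  # one scale per token
        return a8, a8_scale

    a8, a8_scale = per_token_fp8_quant(torch.randn(4, 16))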
@@ -26,10 +70,12 @@ def rocm_aiter_fused_experts(
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         use_fp8_w8a8: bool = False,
+        apply_router_weight_on_input: bool = False,
         w1_scale: Optional[torch.Tensor] = None,
         w2_scale: Optional[torch.Tensor] = None,
         block_shape: Optional[List[int]] = None,
         expert_mask: Optional[torch.Tensor] = None,
+        activation: str = "silu",
         **kwagrs  # Ignore additional keyword arguments
 ) -> torch.Tensor:

@@ -38,8 +84,22 @@ def rocm_aiter_fused_experts(

     from vllm.model_executor.layers.quantization.utils.fp8_utils import (
         per_token_group_quant_fp8)
+
+    if apply_router_weight_on_input:
+        _, topk = topk_weights.shape
+        assert (
+            topk == 1
+        ), "Only support topk=1 when `apply_router_weight_on_input` is True"
+
+        hidden_states = hidden_states * topk_weights.to(hidden_states.dtype)
+        topk_ids = topk_ids.to(torch.int32)
+        topk_weights = torch.ones_like(topk_weights, dtype=torch.float32)
+
+    if is_rocm_aiter_block_scaled_moe_enabled() and use_fp8_w8a8:
+        assert not apply_router_weight_on_input, (
+            "apply_router_weight_on_input is not supported for block scaled moe"
+        )

-    if envs.VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE and use_fp8_w8a8:
         assert w1_scale is not None
         assert w2_scale is not None

@@ -88,8 +148,53 @@ def rocm_aiter_fused_experts(
             None,
         )
         return out_asm
+
+    elif is_rocm_aiter_channel_scaled_moe_enabled() and use_fp8_w8a8:
+        topk_weights = topk_weights.to(torch.float32)
+        topk_ids = topk_ids.to(torch.int32)
+
+        E, model_dim, _ = w2.shape
+        dtype = hidden_states.dtype
+
+        if expert_mask is not None:
+            E = expert_mask.numel()
+
+        (
+            sorted_token_ids,
+            sorted_weight_buf,
+            sorted_expert_ids,
+            num_valid_ids,
+            out_asm,
+        ) = rocm_aiter_asm_fmoe.moe_sorting_ck(topk_ids,
+                                               topk_weights,
+                                               E,
+                                               model_dim,
+                                               dtype,
+                                               expert_mask=expert_mask)
+
+        asm_moe_tkw1_impl(
+            sorted_ids=sorted_token_ids,
+            sorted_weights=sorted_weight_buf,
+            sorted_expert_ids=sorted_expert_ids,
+            num_valid_ids=num_valid_ids,
+            moe_buf=out_asm,
+            hidden_states=hidden_states,
+            w1=w1,
+            w2=w2,
+            topk_weight=topk_weights,
+            topk_ids=topk_ids,
+            fc1_scale=w1_scale,
+            fc2_scale=w2_scale,
+            fc1_smooth_scale=None,
+            fc2_smooth_scale=None,
+            activation_str=activation)
+
+        return out_asm

     elif use_fp8_w8a8:
+        assert not apply_router_weight_on_input, (
+            "apply_router_weight_on_input is not supported for fp8_w8a8")
+
         return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states,
                                            w1=w1,
                                            w2=w2,
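The apply_router_weight_on_input handling added above folds the single routing weight into the activations and then neutralizes the weight tensor, so the kernel does not apply it a second time. A runnable plain-torch sketch of just that transformation (tensor shapes are invented for illustration, no aiter needed):

    # Standalone illustration of the pre-scaling added above.
    import torch

    hidden_states = torch.randn(4, 8)                 # [num_tokens, hidden_dim]
    topk_weights = torch.rand(4, 1)                   # topk must be 1 for this path
    topk_ids = torch.zeros(4, 1, dtype=torch.int64)

    _, topk = topk_weights.shape
    assert topk == 1, "Only support topk=1 when `apply_router_weight_on_input` is True"

    # Fold the router weight into the activations up front ...
    hidden_states = hidden_states * topk_weights.to(hidden_states.dtype)
    # ... then neutralize the weights so they are not applied again downstream.
    topk_ids = topk_ids.to(torch.int32)
    topk_weights = torch.ones_like(topk_weights, dtype=torch.float32)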

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 69 additions & 0 deletions

@@ -14,6 +14,9 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
+from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+    is_rocm_aiter_channel_scaled_moe_enabled, rocm_aiter_fused_experts,
+    shuffle_weights)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     WNA16_SUPPORTED_BITS)
 from vllm.model_executor.layers.quantization.utils import replace_parameter

@@ -36,6 +39,7 @@ class GPTQMarlinState(Enum):
     "CompressedTensorsW8A8Fp8MoECutlassMethod",
     "CompressedTensorsWNA16MarlinMoEMethod",
     "CompressedTensorsWNA16MoEMethod",
+    "CompressedTensorsW8A8Fp8MoEAiterMethod",
 ]


@@ -70,6 +74,8 @@ def get_moe_method(
               and layer.activation == "silu" and layer.expert_map is None):
             return CompressedTensorsW8A8Fp8MoECutlassMethod(quant_config)
         elif quant_config._is_fp8_w8a8(weight_quant, input_quant):
+            if is_rocm_aiter_channel_scaled_moe_enabled():
+                return CompressedTensorsW8A8Fp8MoEAiterMethod(quant_config)
             return CompressedTensorsW8A8Fp8MoEMethod(quant_config)
         else:
             raise RuntimeError(

@@ -302,6 +308,69 @@ def apply(
             a2_scale=layer.w2_input_scale)


+class CompressedTensorsW8A8Fp8MoEAiterMethod(CompressedTensorsW8A8Fp8MoEMethod
+                                             ):
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        super().process_weights_after_loading(layer)
+
+        # reshaping weights is required for aiter moe kernel.
+        shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data,
+                                                    layer.w2_weight.data)
+
+        layer.w13_weight = torch.nn.Parameter(shuffled_w13,
+                                              requires_grad=False)
+        layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        apply_router_weight_on_input: bool = False,
+        activation: str = "silu",
+    ) -> torch.Tensor:
+
+        assert activation in ["silu", "gelu"]
+        assert global_num_experts == layer.w13_weight.shape[0]
+        assert expert_map is None
+
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias)
+
+        return rocm_aiter_fused_experts(
+            hidden_states=x,
+            w1=layer.w13_weight,
+            w2=layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            use_fp8_w8a8=True,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            activation=activation,
+            expert_map=expert_map,
+            apply_router_weight_on_input=apply_router_weight_on_input)
+
+
 class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):

     def __init__(
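A condensed, illustrative view of the dispatch added to get_moe_method() above: for an fp8 w8a8 compressed-tensors checkpoint, the new aiter method is chosen only when the channel-scaled flag resolves to true, otherwise the existing Triton-based method is kept. The helper below is not vLLM code; the booleans stand in for quant_config._is_fp8_w8a8() and is_rocm_aiter_channel_scaled_moe_enabled(), and the other quantization branches are elided.

    def pick_fp8_moe_method(is_fp8_w8a8: bool, channel_scaled_moe: bool) -> str:
        # Mirrors only the two lines added in this commit.
        if is_fp8_w8a8 and channel_scaled_moe:
            return "CompressedTensorsW8A8Fp8MoEAiterMethod"
        if is_fp8_w8a8:
            return "CompressedTensorsW8A8Fp8MoEMethod"
        raise RuntimeError("other quantization schemes handled elsewhere")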
