
Commit df5f297

Merge branch 'aiter_integration_final' into aiter_integration_ck_fused_moe
2 parents cdeb54e + c0dd5ad

14 files changed: +89 additions, -70 deletions

Dockerfile.rocm

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ ENV TOKENIZERS_PARALLELISM=false
 ENV HIP_FORCE_DEV_KERNARG=1
 
 # Enable Aiter. Make sure this only exists on the aiter branch.
-ENV VLLM_USE_AITER=1
+# ENV VLLM_USE_AITER=1
 
 CMD ["/bin/bash"]

Dockerfile.rocm_base

Lines changed: 3 additions & 6 deletions
@@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="485b4b28"
+ARG AITER_BRANCH="41297e56"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 
 FROM ${BASE_IMAGE} AS base
@@ -118,17 +118,14 @@ RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
 FROM base AS build_aiter
 ARG AITER_BRANCH
 ARG AITER_REPO
-COPY requirements-rocm.txt /app
-COPY requirements-common.txt /app
-RUN pip install -r requirements-rocm.txt
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
     pip install /install/*.whl
 RUN git clone --recursive ${AITER_REPO}
 RUN cd aiter \
     && git checkout ${AITER_BRANCH} \
     && git submodule update --init --recursive \
-    && pip install -r requirements.txt \
-    && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
+    && pip install -r requirements.txt
+RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
 RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
 
 FROM base AS final

csrc/rocm/custom_kernels.cu

Lines changed: 1 addition & 1 deletion
@@ -1715,7 +1715,7 @@ void wvSpltKQ_(void* in_a, void* in_b, void* out_c, void* scale_a,
   dim3 block(64, _WvPrGrp); \
   if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) { \
     int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
-    wvSpltKQ_hf_sml_<64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
+    wvSpltKQ_hf_sml_<64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
        <<<grid, block, 0, stream>>>(K_in, Kp_in, M_in, af4, bf4, c, s_a, \
                                     s_b, __wvPrGrp, Otp_in, CuCount); \
   } else { \

vllm/attention/backends/rocm_flash_attn.py

Lines changed: 4 additions & 3 deletions
@@ -12,8 +12,9 @@
                                               AttentionMetadata, AttentionType)
 from vllm.attention.backends.utils import (CommonAttentionState,
                                            CommonMetadataBuilder)
+from vllm.utils import aiter_paged_attn_enabled
 
-if envs.VLLM_USE_AITER_PAGED_ATTN:
+if aiter_paged_attn_enabled():
     from vllm.attention.ops.paged_attn_aiter import (PagedAttention,
                                                      PagedAttentionMetadata)
 else:
@@ -616,7 +617,7 @@ def forward(
         else:
             assert value is None
 
-        if (envs.VLLM_USE_AITER_PAGED_ATTN and kv_cache.dtype.itemsize == 1
+        if (aiter_paged_attn_enabled() and kv_cache.dtype.itemsize == 1
                 and not self.aiter_kv_scales_initialized
                 and kv_cache.shape != torch.Size([0])):
             num_blocks = kv_cache.shape[1]
@@ -804,7 +805,7 @@ def forward(
                 use_custom = _use_rocm_custom_paged_attention(
                     decode_query.dtype, head_size, block_size, gqa_ratio,
                     decode_meta.max_decode_seq_len)
-                if envs.VLLM_USE_AITER_PAGED_ATTN:
+                if aiter_paged_attn_enabled():
                     out = output[num_prefill_tokens:]
                     PagedAttention.forward_decode(
                         decode_query,
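
Note: the aiter_*_enabled() helpers imported from vllm.utils are not part of this diff. A minimal sketch of what such a helper could look like, assuming it simply combines the master VLLM_USE_AITER switch with the per-feature flag (everything beyond the env vars already referenced above is an assumption):

import torch  # noqa: F401  (only to mirror the surrounding module's imports)

from vllm import envs


def aiter_paged_attn_enabled() -> bool:
    # Hypothetical gate: on only when both the master switch and the
    # per-feature flag are set in the environment.
    return bool(envs.VLLM_USE_AITER and envs.VLLM_USE_AITER_PAGED_ATTN)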

vllm/envs.py

Lines changed: 2 additions & 1 deletion
@@ -304,7 +304,8 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
 
     # use ater ck fused moe op if ater ops are enabled
     "VLLM_USE_AITER_2STAGE_MOE":
-    lambda: (os.getenv("VLLM_USE_AITER_2STAGE_MOE", "True").lower() in ("true", "1")),
+    lambda: (os.getenv("VLLM_USE_AITER_2STAGE_MOE", "True").lower() in
+             ("true", "1")),
 
     # use ater paged attn op if ater ops are enabled
     "VLLM_USE_AITER_PAGED_ATTN":

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 3 additions & 3 deletions
@@ -15,9 +15,9 @@
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8)
 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
+from vllm.utils import aiter_moe_enabled, direct_register_custom_op
 
-if envs.VLLM_USE_AITER_MOE:
+if aiter_moe_enabled():
     import aiter
 
 logger = init_logger(__name__)
@@ -950,7 +950,7 @@ def fused_topk(
                                           dtype=torch.int32,
                                           device=hidden_states.device)
 
-    if envs.VLLM_USE_AITER_MOE:
+    if aiter_moe_enabled():
        aiter.topk_softmax(topk_weights, topk_ids, token_expert_indicies,
                           gating_output.float(), renormalize)
    else:
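
Here fused_topk hands the preallocated topk_weights/topk_ids buffers to aiter.topk_softmax when the MoE helper is on. As a rough plain-PyTorch reference for what that fused step produces (an assumption about its semantics, not aiter's implementation):

import torch


def topk_softmax_reference(gating_output: torch.Tensor, topk: int,
                           renormalize: bool):
    # Rough reference: softmax over experts, then per-token top-k expert
    # weights and ids; optional renormalization so the kept weights sum to 1.
    probs = torch.softmax(gating_output.float(), dim=-1)
    topk_weights, topk_ids = torch.topk(probs, topk, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids.to(torch.int32)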

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 4 additions & 4 deletions
@@ -11,16 +11,16 @@
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
-from vllm.envs import VLLM_USE_AITER_MOE
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
+from vllm.utils import aiter_moe_enabled
 
-if VLLM_USE_AITER_MOE:
+if aiter_moe_enabled():
     from aiter import ck_moe
     from aiter.ops.shuffle import shuffle_weight
 
@@ -101,7 +101,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         super().process_weights_after_loading(layer)
 
-        if envs.VLLM_USE_AITER_MOE:
+        if aiter_moe_enabled():
            layer.w13_weight = torch.nn.Parameter(shuffle_weight(
                layer.w13_weight.data),
                                                  requires_grad=False)
@@ -189,7 +189,7 @@ def forward_cuda(
             scoring_func=scoring_func,
             e_score_correction_bias=e_score_correction_bias)
 
-        if VLLM_USE_AITER_MOE:
+        if aiter_moe_enabled():
             return ck_moe(hidden_states=x,
                           w1=layer.w13_weight,
                           w2=layer.w2_weight,
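
The unquantized MoE path pre-shuffles the expert weights once after loading so ck_moe can consume them directly at runtime. Condensed, the post-load step above amounts to the following sketch (it assumes shuffle_weight returns a same-shape tensor in aiter's preferred layout; the standalone function name is illustrative):

import torch
from aiter.ops.shuffle import shuffle_weight


def prepare_unquantized_moe_weights(layer: torch.nn.Module) -> None:
    # Re-lay-out both expert weight tensors and freeze them as Parameters.
    layer.w13_weight = torch.nn.Parameter(shuffle_weight(layer.w13_weight.data),
                                          requires_grad=False)
    layer.w2_weight = torch.nn.Parameter(shuffle_weight(layer.w2_weight.data),
                                         requires_grad=False)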

vllm/model_executor/layers/layernorm.py

Lines changed: 4 additions & 4 deletions
@@ -5,10 +5,10 @@
 import torch
 import torch.nn as nn
 
-from vllm.envs import VLLM_USE_AITER_NORM
 from vllm.model_executor.custom_op import CustomOp
+from vllm.utils import aiter_norm_enabled
 
-if VLLM_USE_AITER_NORM:
+if aiter_norm_enabled():
     import aiter
 
 
@@ -100,7 +100,7 @@ def forward_cuda(
             return out
 
         if residual is not None:
-            if VLLM_USE_AITER_NORM:
+            if aiter_norm_enabled():
                 aiter.rmsnorm2d_fwd_with_add(
                     x,
                     x,
@@ -118,7 +118,7 @@ def forward_cuda(
             )
             return x, residual
 
-        if VLLM_USE_AITER_NORM:
+        if aiter_norm_enabled():
             out = aiter.rms_norm(x, self.weight.data, self.variance_epsilon)
         else:
             out = torch.empty_like(x)
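
On this path aiter.rms_norm and aiter.rmsnorm2d_fwd_with_add stand in for the CUDA custom ops. As a reference for the residual-add variant's assumed semantics (matching the non-aiter branch: add the residual first, then RMS-normalize the sum):

import torch


def rmsnorm_with_add_reference(x: torch.Tensor, residual: torch.Tensor,
                               weight: torch.Tensor, eps: float):
    # The sum becomes both the new residual and the input to RMSNorm.
    residual = x + residual
    hidden = residual.float()
    hidden = hidden * torch.rsqrt(hidden.pow(2).mean(-1, keepdim=True) + eps)
    return hidden.to(x.dtype) * weight, residual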

vllm/model_executor/layers/linear.py

Lines changed: 3 additions & 3 deletions
@@ -7,7 +7,6 @@
 import torch
 from torch.nn.parameter import Parameter, UninitializedParameter
 
-from vllm import envs
 from vllm.distributed import (divide, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               split_tensor_along_last_dim,
@@ -16,8 +15,9 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+from vllm.utils import aiter_linear_enabled
 
-if envs.VLLM_USE_AITER_LINEAR:
+if aiter_linear_enabled():
     from aiter.tuned_gemm import tgemm
 else:
     from vllm.model_executor.layers.tuned_gemm import tgemm
@@ -256,7 +256,7 @@ def forward(
         bias = self.bias if not self.skip_bias_add else None
         assert self.quant_method is not None
         if type(self.quant_method
-                ) is UnquantizedLinearMethod and envs.VLLM_USE_AITER_LINEAR:
+                ) is UnquantizedLinearMethod and aiter_linear_enabled():
             output = tgemm.mm(x, self.weight, bias, self.out_dtype)
         else:
             output = self.quant_method.apply(self, x, bias)
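
On the unquantized path, tgemm.mm(x, self.weight, bias, self.out_dtype) takes over the plain GEMM. Its assumed semantics, expressed as a plain-PyTorch reference (the real tuned path additionally selects a kernel per problem shape):

from typing import Optional

import torch
import torch.nn.functional as F


def tgemm_mm_reference(x: torch.Tensor, weight: torch.Tensor,
                       bias: Optional[torch.Tensor],
                       out_dtype: torch.dtype) -> torch.Tensor:
    # x @ weight.T (+ bias), cast to the requested output dtype.
    return F.linear(x, weight, bias).to(out_dtype)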

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 33 additions & 34 deletions
@@ -32,11 +32,11 @@
                                            PerTensorScaleParameter)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import is_navi
+from vllm.utils import aiter_2stage_moe_enabled, aiter_moe_enabled, is_navi
 
-if envs.VLLM_USE_AITER_MOE:
+if aiter_moe_enabled():
     from aiter.fused_moe_bf16_asm import asm_moe
-    if envs.VLLM_USE_AITER_2STAGE_MOE:
+    if aiter_2stage_moe_enabled():
         from aiter.fused_moe_bf16_asm import ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight
 
@@ -621,7 +621,7 @@ def process_weights_after_loading(self, layer: Module) -> None:
                                                   requires_grad=False)
             layer.w2_weight = torch.nn.Parameter(w2_weight,
                                                  requires_grad=False)
-            if envs.VLLM_USE_AITER_MOE:
+            if aiter_moe_enabled():
                 w13_scales = layer.w13_weight_scale.data.unsqueeze(
                     -1).unsqueeze(-1).expand(
                         (-1, layer.w13_weight.shape[1], -1))
@@ -632,13 +632,13 @@ def process_weights_after_loading(self, layer: Module) -> None:
                 layer.w13_weight_scale = torch.nn.Parameter(
                     w13_scales.contiguous(), requires_grad=False)
 
-                if envs.VLLM_USE_AITER_2STAGE_MOE:
-                    layer.w13_weight = torch.nn.Parameter(
-                        shuffle_weight(layer.w13_weight, layout=(32, 32)),
-                        requires_grad=False)
-                    layer.w2_weight = torch.nn.Parameter(
-                        shuffle_weight(layer.w2_weight, layout=(32, 32)),
-                        requires_grad=False)
+                if aiter_2stage_moe_enabled():
+                    layer.w13_weight = torch.nn.Parameter(shuffle_weight(
+                        layer.w13_weight, layout=(32, 32)),
+                                                          requires_grad=False)
+                    layer.w2_weight = torch.nn.Parameter(shuffle_weight(
+                        layer.w2_weight, layout=(32, 32)),
+                                                         requires_grad=False)
                 else:
                     layer.w13_weight = torch.nn.Parameter(shuffle_weight(
                         layer.w13_weight),
@@ -715,32 +715,31 @@ def process_weights_after_loading(self, layer: Module) -> None:
                         dq_weight, max_w13_scales[expert_id])
                     start += shard_size
 
-            if envs.VLLM_USE_AITER_MOE:
-                if envs.VLLM_USE_AITER_2STAGE_MOE:
+            if aiter_moe_enabled():
+                if aiter_2stage_moe_enabled():
                     max_w13_scales = max_w13_scales.unsqueeze(-1)
                     w2_scales = layer.w2_weight_scale.data.unsqueeze(-1)
+                    layer.w13_weight = torch.nn.Parameter(shuffle_weight(
+                        layer.w13_weight, layout=(32, 32)),
+                                                          requires_grad=False)
+                    layer.w2_weight = torch.nn.Parameter(shuffle_weight(
+                        layer.w2_weight, layout=(32, 32)),
+                                                         requires_grad=False)
                 else:
                     max_w13_scales = max_w13_scales.unsqueeze(-1).unsqueeze(
                         -1).expand((-1, layer.w13_weight.shape[1], -1))
-                    w2_scales = layer.w2_weight_scale.data.unsqueeze(-1).unsqueeze(
-                        -1).expand((-1, layer.w2_weight.shape[1], -1))
-
-                layer.w2_weight_scale = torch.nn.Parameter(
-                    w2_scales.contiguous(), requires_grad=False)
-                if envs.VLLM_USE_AITER_2STAGE_MOE:
-                    layer.w13_weight = torch.nn.Parameter(
-                        shuffle_weight(layer.w13_weight, layout=(32, 32)),
-                        requires_grad=False)
-                    layer.w2_weight = torch.nn.Parameter(
-                        shuffle_weight(layer.w2_weight, layout=(32, 32)),
-                        requires_grad=False)
-                else:
+                    w2_scales = layer.w2_weight_scale.data.unsqueeze(
+                        -1).unsqueeze(-1).expand(
+                            (-1, layer.w2_weight.shape[1], -1))
                     layer.w13_weight = torch.nn.Parameter(shuffle_weight(
                         layer.w13_weight),
                                                           requires_grad=False)
                     layer.w2_weight = torch.nn.Parameter(shuffle_weight(
                         layer.w2_weight),
                                                          requires_grad=False)
+
+                layer.w2_weight_scale = torch.nn.Parameter(
+                    w2_scales.contiguous(), requires_grad=False)
                 layer.w13_weight_scale = torch.nn.Parameter(
                     max_w13_scales.contiguous(), requires_grad=False)
                 return
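
The reordering above keeps one distinction worth noting: on the two-stage (ck_moe_2stages) path the per-expert scales only gain a trailing dim, while the asm_moe path expands them to one scale per output row before .contiguous() materializes the copy. A shape-only illustration (E and N are placeholder sizes):

import torch

E, N = 8, 4096                      # placeholder: experts, rows of w2
w2_weight_scale = torch.rand(E)     # one fp8 scale per expert

# ck_moe_2stages path: per-tensor scales, shape (E, 1)
w2_scales_2stage = w2_weight_scale.unsqueeze(-1)

# asm_moe path: broadcast to one scale per output row, shape (E, N, 1)
w2_scales_asm = w2_weight_scale.unsqueeze(-1).unsqueeze(-1).expand(E, N, 1)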
@@ -776,15 +775,15 @@ def apply(
             e_score_correction_bias=e_score_correction_bias,
         )
 
-        if envs.VLLM_USE_AITER_MOE:
-            if envs.VLLM_USE_AITER_2STAGE_MOE:
+        if aiter_moe_enabled():
+            if aiter_2stage_moe_enabled():
                 return ck_moe_2stages(a1=x,
-                                        w1=layer.w13_weight,
-                                        w2=layer.w2_weight,
-                                        topk_weight=topk_weights,
-                                        topk_ids=topk_ids,
-                                        fc1_scale=layer.w13_weight_scale,
-                                        fc2_scale=layer.w2_weight_scale)
+                                      w1=layer.w13_weight,
+                                      w2=layer.w2_weight,
+                                      topk_weight=topk_weights,
+                                      topk_ids=topk_ids,
+                                      fc1_scale=layer.w13_weight_scale,
+                                      fc2_scale=layer.w2_weight_scale)
 
         return asm_moe(
             hidden_states=x,
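
Taken together with the Dockerfile.rocm change at the top (VLLM_USE_AITER=1 is no longer baked into the image), the aiter paths now have to be switched on per run. A usage sketch, assuming VLLM_USE_AITER acts as the master switch consulted by the aiter_*_enabled() helpers (the model name is a placeholder):

import os

# Set the flags before importing vllm so the module-level aiter imports and
# lazy env reads see them.
os.environ["VLLM_USE_AITER"] = "1"
os.environ["VLLM_USE_AITER_2STAGE_MOE"] = "1"   # ck_moe_2stages path for fp8 MoE
os.environ["VLLM_USE_AITER_PAGED_ATTN"] = "1"   # aiter paged-attention backend

from vllm import LLM

llm = LLM(model="some-org/some-fp8-moe-model")  # placeholder model id
print(llm.generate("Hello"))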
