ROCm · zhuyuhua-v · Sep 19, 2025 · Sep 29, 2025 · Sep 29, 2025
diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
@@ -159,6 +159,10 @@
     structured outputs, speculative decoding, and pipeline parallelism.
     """
 
+    split_prefill_from_chunk: bool = False
+    """Whether to split prefill requests into pure prefill and chunked prefill
+    within a single batch."""
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,

diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
@@ -239,16 +239,11 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
         if selected_backend is None or selected_backend == _Backend.FLASH_ATTN:
             selected_backend = _Backend.ROCM_FLASH
 
-        if envs.VLLM_USE_V1:
-            if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA \
-                and on_gfx9():
-                logger.info("Using Flash Attention backend on V1 engine.")
-                return ("vllm.v1.attention.backends."
-                        "rocm_aiter_fa.AiterFlashAttentionBackend")
-            else:
-                logger.info("Using Triton Attention backend on V1 engine.")
-                return ("vllm.v1.attention.backends."
-                        "triton_attn.TritonAttentionBackend")
+            if envs.VLLM_USE_V1:
+                from vllm.v1.attention.backends.rocm_mha_backend_helper import get_rocm_mha_backend_selection
+                backend_class_path, _ = get_rocm_mha_backend_selection()
+                if backend_class_path:
+                    return backend_class_path
         if selected_backend == _Backend.ROCM_FLASH:
             if not cls.has_device_capability(90):
                 # not Instinct series GPUs.
@@ -346,6 +341,10 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 else:
                     parallel_config.worker_cls = "vllm.worker.worker.Worker"
 
+        if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA:
+            # Enable request reordering when AITER MHA is in use.
+            vllm_config.scheduler_config.split_prefill_from_chunk = True
+
     @classmethod
     def verify_model_arch(cls, model_arch: str) -> None:
         if model_arch in _ROCM_UNSUPPORTED_MODELS: