From a2d5ef088a55428a853c3204b2cc757def8ae9d0 Mon Sep 17 00:00:00 2001 From: FENP <32334296+FENP@users.noreply.github.com> Date: Thu, 16 Oct 2025 21:55:26 +0800 Subject: [PATCH] bugfix: set reorder_batch_threshold back to 1 when using FlashMLA with DCP enabled Signed-off-by: FENP <32334296+FENP@users.noreply.github.com> --- vllm/v1/attention/backends/mla/common.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 1d4e3e4cfe22..38a4d5df1b03 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -558,6 +558,19 @@ def __init__( self.dcp_world_size = 1 self.dcp_rank = 0 + if ( + self.dcp_world_size > 1 + and self.__class__.reorder_batch_threshold > 1 + and self.__class__.__name__ != "FlashAttnMLAMetadataBuilder" + ): + logger.warning_once( + "DCP is enabled but not FlashAttnMLA is used. " + "Set query_len_support back to SINGLE_ONLY " + "and reorder_batch_threshold back to 1." + ) + self.__class__.query_len_support = QueryLenSupport.SINGLE_ONLY + self.__class__.reorder_batch_threshold = 1 + # Don't try to access the runner on AMD if self.aot_schedule: self.page_size = self.kv_cache_spec.block_size