From 6d7840c3957f5364b484e00f5aa8b95eaf4f18bc Mon Sep 17 00:00:00 2001 From: ShaoChunLee Date: Fri, 25 Jul 2025 03:58:03 +0000 Subject: [PATCH 001/233] add fused fp8 bmm Signed-off-by: Divakar Verma --- vllm/v1/attention/backends/mla/common.py | 39 ++++++++++++++++++++---- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index badff67656c2..a47027f25d4c 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -234,6 +234,26 @@ except ImportError: flashinfer_available = False + +def dynamic_per_batched_tensor_quant( + x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn +): + DTYPE_MAX = torch.finfo(dtype).max + min_val, max_val = x.aminmax() + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-10) + scale = DTYPE_MAX / amax + x_scl_sat = (x * scale).clamp(min=-DTYPE_MAX, max=DTYPE_MAX) + return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal() + +from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant +@torch.compiler.disable +def aiter_triton_fp8_bmm_wrapper(x, w, w_s, y = None, transpose_bm = False): + if y is not None: + batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant(x, w, w_s, YQ=y, transpose_bm=transpose_bm) + else: + y = batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant(x, w, w_s, transpose_bm = transpose_bm) + return y + logger = init_logger(__name__) CUDNN_WORKSPACE_SIZE = 12800 @@ -953,7 +973,8 @@ def _v_up_proj(self, x): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) # Multiply (N, B, L) x (N, L, V) -> (N, B, V) - x = torch.bmm(x, self.W_UV) + x = aiter_triton_fp8_bmm_wrapper(x, self.W_V, self.W_V_scale, transpose_bm = False) + # x = torch.bmm(x, self.W_UV) # Convert from (N, B, V) to (B, N * V) return x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) @@ -986,6 +1007,7 @@ def get_and_maybe_dequant_weights(layer: LinearBase): # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform # the bmm's in 16-bit, the extra memory overhead of this is fairly low kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T + assert kv_b_proj_weight.shape == ( self.kv_lora_rank, self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), ( @@ -1002,11 +1024,16 @@ def get_and_maybe_dequant_weights(layer: LinearBase): W_UK, W_UV = kv_b_proj_weight.split( [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - - # Convert from (L, N, V) to (N, L, V) - self.W_UV = W_UV.transpose(0, 1) - # Convert from (L, N, P) to (N, P, L) - self.W_UK_T = W_UK.permute(1, 2, 0) + + W_K = W_UK.transpose(0, 1) # 16 512 128 + W_V = W_UV.permute(1, 2, 0) # 16 128 512 + self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant(W_K, dtype=torch.float8_e4m3fnuz) + self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant(W_V, dtype=torch.float8_e4m3fnuz) + + # # Convert from (L, N, V) to (N, L, V) + # self.W_UV = W_UV.transpose(0, 1) + # # Convert from (L, N, P) to (N, P, L) + # self.W_UK_T = W_UK.permute(1, 2, 0) def _compute_prefill_context( self, From 92e134a007462b53322f8c5344504ce548ee737c Mon Sep 17 00:00:00 2001 From: ShaoChunLee Date: Sat, 26 Jul 2025 06:27:28 +0000 Subject: [PATCH 002/233] add envs Signed-off-by: Divakar Verma --- vllm/envs.py | 4 ++ vllm/v1/attention/backends/mla/common.py | 
74 +++++++++++++----------- 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 931edcfa7f1e..18f93472be84 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -158,6 +158,7 @@ VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False + VLLM_AITER_TRITON_FP8_BMM: bool = False def get_default_cache_root(): @@ -971,6 +972,9 @@ def get_vllm_port() -> Optional[int]: # limit will actually be zero-copy decoded. "VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")), + + "VLLM_AITER_TRITON_FP8_BMM": + lambda: bool(int(os.getenv("VLLM_AITER_TRITON_FP8_BMM", "0"))), # If set, allow insecure serialization using pickle. # This is useful for environments where it is deemed safe to use the diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index a47027f25d4c..4de29f7c307b 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -234,25 +234,25 @@ except ImportError: flashinfer_available = False - -def dynamic_per_batched_tensor_quant( - x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn -): - DTYPE_MAX = torch.finfo(dtype).max - min_val, max_val = x.aminmax() - amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-10) - scale = DTYPE_MAX / amax - x_scl_sat = (x * scale).clamp(min=-DTYPE_MAX, max=DTYPE_MAX) - return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal() - -from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant -@torch.compiler.disable -def aiter_triton_fp8_bmm_wrapper(x, w, w_s, y = None, transpose_bm = False): - if y is not None: - batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant(x, w, w_s, YQ=y, transpose_bm=transpose_bm) - else: - y = batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant(x, w, w_s, transpose_bm = transpose_bm) - return y +if envs.VLLM_AITER_TRITON_FP8_BMM: + def dynamic_per_batched_tensor_quant( + x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn + ): + DTYPE_MAX = torch.finfo(dtype).max + min_val, max_val = x.aminmax() + amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-10) + scale = DTYPE_MAX / amax + x_scl_sat = (x * scale).clamp(min=-DTYPE_MAX, max=DTYPE_MAX) + return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal() + + from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant + @torch.compiler.disable + def aiter_triton_fp8_bmm_wrapper(x, w, w_s, group_size = 128, y = None, transpose_bm = False): + if y is not None: + batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant(x, w, w_s, group_size = group_size, YQ=y, transpose_bm=transpose_bm) + else: + y = batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant(x, w, w_s, group_size = group_size, transpose_bm = transpose_bm) + return y logger = init_logger(__name__) @@ -972,11 +972,18 @@ def _run_prefill_context_chunk_cudnn(self, def _v_up_proj(self, x): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) - # Multiply (N, B, L) x (N, L, V) -> (N, B, V) - x = aiter_triton_fp8_bmm_wrapper(x, self.W_V, self.W_V_scale, transpose_bm = False) - # x 
= torch.bmm(x, self.W_UV) - # Convert from (N, B, V) to (B, N * V) - return x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) + if envs.VLLM_AITER_TRITON_FP8_BMM: + # Multiply + Transpose (N, B, L) x (N, L, V) -> (N, B, V) -> (B, N, V) + x = aiter_triton_fp8_bmm_wrapper(x, self.W_V, self.W_V_scale, group_size = 256, transpose_bm = True) + # Convert from (B, N, V) to (B, N * V) + x = x.reshape(-1, self.num_heads * self.v_head_dim) + else: + # Multiply (N, B, L) x (N, L, V) -> (N, B, V) + x = torch.bmm(x, self.W_UV) + # Convert from (N, B, V) to (B, N * V) + x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) + return x + def process_weights_after_loading(self, act_dtype: torch.dtype): @@ -1025,15 +1032,16 @@ def get_and_maybe_dequant_weights(layer: LinearBase): W_UK, W_UV = kv_b_proj_weight.split( [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - W_K = W_UK.transpose(0, 1) # 16 512 128 - W_V = W_UV.permute(1, 2, 0) # 16 128 512 - self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant(W_K, dtype=torch.float8_e4m3fnuz) - self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant(W_V, dtype=torch.float8_e4m3fnuz) - - # # Convert from (L, N, V) to (N, L, V) - # self.W_UV = W_UV.transpose(0, 1) - # # Convert from (L, N, P) to (N, P, L) - # self.W_UK_T = W_UK.permute(1, 2, 0) + if envs.VLLM_AITER_TRITON_FP8_BMM: + W_K = W_UK.transpose(0, 1) # 16 512 128 + W_V = W_UV.permute(1, 2, 0) # 16 128 512 + self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant(W_K, dtype=torch.float8_e4m3fnuz) + self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant(W_V, dtype=torch.float8_e4m3fnuz) + else: + # Convert from (L, N, V) to (N, L, V) + self.W_UV = W_UV.transpose(0, 1) + # Convert from (L, N, P) to (N, P, L) + self.W_UK_T = W_UK.permute(1, 2, 0) def _compute_prefill_context( self, From 9433b841b46e682318bfbff789fbbfade6edc08d Mon Sep 17 00:00:00 2001 From: Divakar Verma Date: Fri, 8 Aug 2025 02:03:20 +0000 Subject: [PATCH 003/233] api fix for upstream compatibility Signed-off-by: Divakar Verma --- vllm/v1/attention/backends/mla/common.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 4de29f7c307b..5589983f2235 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -1234,10 +1234,15 @@ def forward( [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) # Convert from (B, N, P) to (N, B, P) decode_q_nope = decode_q_nope.transpose(0, 1) - # Multiply (N, B, P) x (N, P, L) -> (N, B, L) - decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) - # Convert from (N, B, L) to (B, N, L) - decode_ql_nope = decode_ql_nope.transpose(0, 1) + + if envs.VLLM_AITER_TRITON_FP8_BMM: + # Multiply + Transpose (N, B, P) x (N, P, L) -> (N, B, L) -> (B, N, L) + decode_ql_nope = aiter_triton_fp8_bmm_wrapper(decode_q_nope, self.W_K, self.W_K_scale, group_size = 128, transpose_bm = True) + else: + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) + decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) + # Convert from (N, B, L) to (B, N, L) + decode_ql_nope = decode_ql_nope.transpose(0, 1) output[:num_decode_tokens] = self._forward_decode( decode_ql_nope, decode_q_pe, kv_cache, attn_metadata) From 245f2eb15025751cf2ca515b43b4e72d664f99e2 Mon Sep 17 00:00:00 2001 From: Divakar Verma Date: Tue, 12 Aug 2025 16:32:53 +0000 Subject: [PATCH 004/233] improve env switch. 
reformat lint Signed-off-by: Divakar Verma --- vllm/envs.py | 11 ++- vllm/v1/attention/backends/mla/common.py | 97 ++++++++++++++++++------ 2 files changed, 79 insertions(+), 29 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 18f93472be84..c06a3c4ee1c6 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -97,6 +97,7 @@ VLLM_ROCM_USE_AITER_RMSNORM: bool = True VLLM_ROCM_USE_AITER_MLA: bool = True VLLM_ROCM_USE_AITER_MHA: bool = True + VLLM_ROCM_USE_AITER_FP8BMM: bool = True VLLM_ROCM_USE_SKINNY_GEMM: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ROCM_MOE_PADDING: bool = True @@ -158,7 +159,6 @@ VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False - VLLM_AITER_TRITON_FP8_BMM: bool = False def get_default_cache_root(): @@ -750,6 +750,12 @@ def get_vllm_port() -> Optional[int]: lambda: (os.getenv("VLLM_ROCM_USE_AITER_MHA", "True").lower() in ("true", "1")), + # Whether to use aiter triton fp8 bmm kernel + # By default it enabled. + "VLLM_ROCM_USE_AITER_FP8BMM": + lambda: (os.getenv("VLLM_ROCM_USE_AITER_FP8BMM", "True").lower() in + ("true", "1")), + # use rocm skinny gemms "VLLM_ROCM_USE_SKINNY_GEMM": lambda: (os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in @@ -972,9 +978,6 @@ def get_vllm_port() -> Optional[int]: # limit will actually be zero-copy decoded. "VLLM_MSGPACK_ZERO_COPY_THRESHOLD": lambda: int(os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")), - - "VLLM_AITER_TRITON_FP8_BMM": - lambda: bool(int(os.getenv("VLLM_AITER_TRITON_FP8_BMM", "0"))), # If set, allow insecure serialization using pickle. # This is useful for environments where it is deemed safe to use the diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 5589983f2235..09d4148991c2 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -234,10 +234,32 @@ except ImportError: flashinfer_available = False -if envs.VLLM_AITER_TRITON_FP8_BMM: + +def is_rocm_aiter_fp8bmm_enabled() -> bool: + return current_platform.is_rocm() \ + and envs.VLLM_ROCM_USE_AITER_FP8BMM \ + and envs.VLLM_ROCM_USE_AITER + + +if is_rocm_aiter_fp8bmm_enabled(): + from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501 + batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_fp8_bmm) + + def aiter_triton_fp8_bmm_wrapper(x, + w, + w_s, + group_size=128, + y=None, + transpose_bm=False): + return aiter_fp8_bmm(x, + w, + w_s, + group_size=group_size, + YQ=y, + transpose_bm=transpose_bm) + def dynamic_per_batched_tensor_quant( - x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn - ): + x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn): DTYPE_MAX = torch.finfo(dtype).max min_val, max_val = x.aminmax() amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-10) @@ -245,15 +267,7 @@ def dynamic_per_batched_tensor_quant( x_scl_sat = (x * scale).clamp(min=-DTYPE_MAX, max=DTYPE_MAX) return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal() - from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant - @torch.compiler.disable - def aiter_triton_fp8_bmm_wrapper(x, w, w_s, group_size = 128, y = None, transpose_bm = False): - if y is not None: - batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant(x, w, w_s, 
group_size = group_size, YQ=y, transpose_bm=transpose_bm) - else: - y = batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant(x, w, w_s, group_size = group_size, transpose_bm = transpose_bm) - return y - + logger = init_logger(__name__) CUDNN_WORKSPACE_SIZE = 12800 @@ -972,9 +986,13 @@ def _run_prefill_context_chunk_cudnn(self, def _v_up_proj(self, x): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) - if envs.VLLM_AITER_TRITON_FP8_BMM: - # Multiply + Transpose (N, B, L) x (N, L, V) -> (N, B, V) -> (B, N, V) - x = aiter_triton_fp8_bmm_wrapper(x, self.W_V, self.W_V_scale, group_size = 256, transpose_bm = True) + if is_rocm_aiter_fp8bmm_enabled(): + # Multiply + Transpose (N, B, L) x (N, L, V)->(N, B, V)->(B, N, V) + x = aiter_triton_fp8_bmm_wrapper(x, + self.W_V, + self.W_V_scale, + group_size=128, + transpose_bm=True) # Convert from (B, N, V) to (B, N * V) x = x.reshape(-1, self.num_heads * self.v_head_dim) else: @@ -984,7 +1002,6 @@ def _v_up_proj(self, x): x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) return x - def process_weights_after_loading(self, act_dtype: torch.dtype): def get_layer_weight(layer): @@ -1031,12 +1048,37 @@ def get_and_maybe_dequant_weights(layer: LinearBase): W_UK, W_UV = kv_b_proj_weight.split( [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - - if envs.VLLM_AITER_TRITON_FP8_BMM: - W_K = W_UK.transpose(0, 1) # 16 512 128 - W_V = W_UV.permute(1, 2, 0) # 16 128 512 - self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant(W_K, dtype=torch.float8_e4m3fnuz) - self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant(W_V, dtype=torch.float8_e4m3fnuz) + + if is_rocm_aiter_fp8bmm_enabled(): + W_K = W_UK.transpose(0, 1) # 16 512 128 + W_V = W_UV.permute(1, 2, 0) # 16 128 512 + self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant( + W_K, dtype=torch.float8_e4m3fnuz) + self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant( + W_V, dtype=torch.float8_e4m3fnuz) + logger.info_once( + "[Aiter Triton] compiling fp8 BMM for batch sizes 1 to 128 " + f"W_K shape = {list(self.W_K.shape)} and " + f"W_V shape = {list(self.W_V.shape)}") + for m in range(1, 129): + x = torch.empty((self.W_K.shape[0], m, self.W_K.shape[2]), + dtype=torch.bfloat16, + device=self.W_K.device) + aiter_triton_fp8_bmm_wrapper(x, + self.W_K, + self.W_K_scale, + group_size=128, + transpose_bm=True) + + x = torch.empty((self.W_V.shape[0], m, self.W_V.shape[2]), + dtype=torch.bfloat16, + device=self.W_V.device) + aiter_triton_fp8_bmm_wrapper(x, + self.W_V, + self.W_V_scale, + group_size=128, + transpose_bm=True) + else: # Convert from (L, N, V) to (N, L, V) self.W_UV = W_UV.transpose(0, 1) @@ -1235,9 +1277,14 @@ def forward( # Convert from (B, N, P) to (N, B, P) decode_q_nope = decode_q_nope.transpose(0, 1) - if envs.VLLM_AITER_TRITON_FP8_BMM: - # Multiply + Transpose (N, B, P) x (N, P, L) -> (N, B, L) -> (B, N, L) - decode_ql_nope = aiter_triton_fp8_bmm_wrapper(decode_q_nope, self.W_K, self.W_K_scale, group_size = 128, transpose_bm = True) + if is_rocm_aiter_fp8bmm_enabled(): + # Multiply+Transpose (N, B, P)x(N, P, L)->(N, B, L)->(B, N, L) + decode_ql_nope = aiter_triton_fp8_bmm_wrapper( + decode_q_nope, + self.W_K, + self.W_K_scale, + group_size=128, + transpose_bm=True) else: # Multiply (N, B, P) x (N, P, L) -> (N, B, L) decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) From 6fd99d2f727ef0012d9efd5f296e2f1351cddf9b Mon Sep 17 00:00:00 2001 From: Divakar Verma Date: Tue, 12 Aug 2025 19:07:58 
+0000 Subject: [PATCH 005/233] nit: formatting and direct aiter fxn call Signed-off-by: Divakar Verma --- vllm/envs.py | 2 +- vllm/v1/attention/backends/mla/common.py | 57 +++++++++--------------- 2 files changed, 22 insertions(+), 37 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index c06a3c4ee1c6..0b016dbc85d6 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -751,7 +751,7 @@ def get_vllm_port() -> Optional[int]: ("true", "1")), # Whether to use aiter triton fp8 bmm kernel - # By default it enabled. + # By default is enabled. "VLLM_ROCM_USE_AITER_FP8BMM": lambda: (os.getenv("VLLM_ROCM_USE_AITER_FP8BMM", "True").lower() in ("true", "1")), diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 09d4148991c2..f96a977bb315 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -243,20 +243,7 @@ def is_rocm_aiter_fp8bmm_enabled() -> bool: if is_rocm_aiter_fp8bmm_enabled(): from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501 - batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_fp8_bmm) - - def aiter_triton_fp8_bmm_wrapper(x, - w, - w_s, - group_size=128, - y=None, - transpose_bm=False): - return aiter_fp8_bmm(x, - w, - w_s, - group_size=group_size, - YQ=y, - transpose_bm=transpose_bm) + batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm) def dynamic_per_batched_tensor_quant( x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn): @@ -988,11 +975,11 @@ def _v_up_proj(self, x): x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) if is_rocm_aiter_fp8bmm_enabled(): # Multiply + Transpose (N, B, L) x (N, L, V)->(N, B, V)->(B, N, V) - x = aiter_triton_fp8_bmm_wrapper(x, - self.W_V, - self.W_V_scale, - group_size=128, - transpose_bm=True) + x = aiter_triton_fp8_bmm(x, + self.W_V, + self.W_V_scale, + group_size=128, + transpose_bm=True) # Convert from (B, N, V) to (B, N * V) x = x.reshape(-1, self.num_heads * self.v_head_dim) else: @@ -1031,7 +1018,6 @@ def get_and_maybe_dequant_weights(layer: LinearBase): # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform # the bmm's in 16-bit, the extra memory overhead of this is fairly low kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T - assert kv_b_proj_weight.shape == ( self.kv_lora_rank, self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), ( @@ -1064,20 +1050,20 @@ def get_and_maybe_dequant_weights(layer: LinearBase): x = torch.empty((self.W_K.shape[0], m, self.W_K.shape[2]), dtype=torch.bfloat16, device=self.W_K.device) - aiter_triton_fp8_bmm_wrapper(x, - self.W_K, - self.W_K_scale, - group_size=128, - transpose_bm=True) + aiter_triton_fp8_bmm(x, + self.W_K, + self.W_K_scale, + group_size=128, + transpose_bm=True) x = torch.empty((self.W_V.shape[0], m, self.W_V.shape[2]), dtype=torch.bfloat16, device=self.W_V.device) - aiter_triton_fp8_bmm_wrapper(x, - self.W_V, - self.W_V_scale, - group_size=128, - transpose_bm=True) + aiter_triton_fp8_bmm(x, + self.W_V, + self.W_V_scale, + group_size=128, + transpose_bm=True) else: # Convert from (L, N, V) to (N, L, V) @@ -1279,12 +1265,11 @@ def forward( if is_rocm_aiter_fp8bmm_enabled(): # Multiply+Transpose (N, B, P)x(N, P, L)->(N, B, L)->(B, N, L) - decode_ql_nope = aiter_triton_fp8_bmm_wrapper( - decode_q_nope, - self.W_K, - self.W_K_scale, - group_size=128, - transpose_bm=True) + decode_ql_nope = 
aiter_triton_fp8_bmm(decode_q_nope, + self.W_K, + self.W_K_scale, + group_size=128, + transpose_bm=True) else: # Multiply (N, B, P) x (N, P, L) -> (N, B, L) decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) From c219220e866cf4856704f388b0fab26b585f8a09 Mon Sep 17 00:00:00 2001 From: Divakar Verma Date: Tue, 12 Aug 2025 20:14:28 +0000 Subject: [PATCH 006/233] fit fp8 dtype selection Signed-off-by: Divakar Verma --- vllm/v1/attention/backends/mla/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index f96a977bb315..40d29297ce07 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -1039,9 +1039,9 @@ def get_and_maybe_dequant_weights(layer: LinearBase): W_K = W_UK.transpose(0, 1) # 16 512 128 W_V = W_UV.permute(1, 2, 0) # 16 128 512 self.W_K, self.W_K_scale = dynamic_per_batched_tensor_quant( - W_K, dtype=torch.float8_e4m3fnuz) + W_K, dtype=current_platform.fp8_dtype()) self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant( - W_V, dtype=torch.float8_e4m3fnuz) + W_V, dtype=current_platform.fp8_dtype()) logger.info_once( "[Aiter Triton] compiling fp8 BMM for batch sizes 1 to 128 " f"W_K shape = {list(self.W_K.shape)} and " From 8017d7d72f5a6221cf7fba9282a72bc32703a15c Mon Sep 17 00:00:00 2001 From: Divakar Verma Date: Thu, 14 Aug 2025 11:25:36 -0500 Subject: [PATCH 007/233] rm kernel warmup Signed-off-by: Divakar Verma --- vllm/v1/attention/backends/mla/common.py | 28 +++--------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 40d29297ce07..85f6b56b5503 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -242,8 +242,9 @@ def is_rocm_aiter_fp8bmm_enabled() -> bool: if is_rocm_aiter_fp8bmm_enabled(): - from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501 - batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm) + from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501 # isort: skip + batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant + as aiter_triton_fp8_bmm) def dynamic_per_batched_tensor_quant( x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn): @@ -1042,29 +1043,6 @@ def get_and_maybe_dequant_weights(layer: LinearBase): W_K, dtype=current_platform.fp8_dtype()) self.W_V, self.W_V_scale = dynamic_per_batched_tensor_quant( W_V, dtype=current_platform.fp8_dtype()) - logger.info_once( - "[Aiter Triton] compiling fp8 BMM for batch sizes 1 to 128 " - f"W_K shape = {list(self.W_K.shape)} and " - f"W_V shape = {list(self.W_V.shape)}") - for m in range(1, 129): - x = torch.empty((self.W_K.shape[0], m, self.W_K.shape[2]), - dtype=torch.bfloat16, - device=self.W_K.device) - aiter_triton_fp8_bmm(x, - self.W_K, - self.W_K_scale, - group_size=128, - transpose_bm=True) - - x = torch.empty((self.W_V.shape[0], m, self.W_V.shape[2]), - dtype=torch.bfloat16, - device=self.W_V.device) - aiter_triton_fp8_bmm(x, - self.W_V, - self.W_V_scale, - group_size=128, - transpose_bm=True) - else: # Convert from (L, N, V) to (N, L, V) self.W_UV = W_UV.transpose(0, 1) From 3eed848c93a5d60816738ac51be3ebdc656132a8 Mon Sep 17 00:00:00 2001 From: Xiaozhu Meng Date: Tue, 12 Aug 2025 12:53:36 -0700 
Subject: [PATCH 008/233] [Kernel][AMD] Avoid D2H copy and cumsum kernel (#22683) Signed-off-by: Xiaozhu Signed-off-by: Michael Goin Co-authored-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/v1/attention/backends/rocm_aiter_fa.py | 32 +++++++++++++-------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index abe05174507f..e8bffbef4415 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -214,12 +214,14 @@ class AiterFlashAttentionMetadata: # |-- query_len ---| num_actual_tokens: int # Number of tokens excluding padding. + num_actual_kv_tokens: int max_query_len: int query_start_loc: torch.Tensor max_seq_len: int seq_lens: torch.Tensor slot_mapping: torch.Tensor block_table: torch.Tensor + cu_seq_lens: Optional[torch.Tensor] # For cascade attention. use_cascade: bool @@ -272,6 +274,20 @@ def build(self, seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping + if max_query_len > 1: + # We pre-compute cumulative seq len needed for prefill attention + # here to avoid recomputing it for every layer + cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1, + dtype=torch.int32, + device=seq_lens.device) + torch.cumsum(seq_lens, + dim=0, + dtype=cu_seq_lens.dtype, + out=cu_seq_lens[1:]) + num_actual_kv_tokens = int(cu_seq_lens[-1].item()) + else: + cu_seq_lens = None + num_actual_kv_tokens = 0 def schedule(batch_size, cu_query_lens, max_query_len, seqlens, max_seq_len, causal): @@ -281,12 +297,14 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens, attn_metadata = AiterFlashAttentionMetadata( num_actual_tokens=num_actual_tokens, + num_actual_kv_tokens=num_actual_kv_tokens, max_query_len=max_query_len, query_start_loc=query_start_loc, max_seq_len=max_seq_len, seq_lens=seq_lens, block_table=block_table_tensor, slot_mapping=slot_mapping, + cu_seq_lens=cu_seq_lens, use_cascade=use_cascade, common_prefix_len=common_prefix_len, total_tokens=self.total_tokens, @@ -475,16 +493,6 @@ def forward( block_table = attn_metadata.block_table if max_seqlen_q > 1: - - cu_seq_lens = torch.zeros(seqused_k.shape[0] + 1, - dtype=torch.int32, - device=query.device) - - torch.cumsum(seqused_k, - dim=0, - dtype=cu_seq_lens.dtype, - out=cu_seq_lens[1:]) - torch.ops.vllm.flash_attn_varlen_func( query[:num_actual_tokens], key_cache, @@ -497,10 +505,10 @@ def forward( alibi_slopes=self.alibi_slopes, window_size=self.sliding_window, block_table=block_table, - cu_seqlens_k=cu_seq_lens, + cu_seqlens_k=attn_metadata.cu_seq_lens, k_scale=layer._k_scale, v_scale=layer._v_scale, - total_tokens=attn_metadata.total_tokens, + total_tokens=attn_metadata.num_actual_kv_tokens, ) _, num_heads, head_size = query.shape From 380030fb2eb5c4e9bee18562c35f6ab771380899 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 12 Aug 2025 21:53:52 +0200 Subject: [PATCH 009/233] [CI][Nixl] Check kv cache layout during handshake (#22745) Signed-off-by: NickLucche --- .../kv_connector/unit/test_nixl_connector.py | 46 +++++++++++++++++++ .../kv_connector/v1/nixl_connector.py | 13 ++++-- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index c6739832355f..3860d7c85724 100644 --- 
a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -419,6 +419,52 @@ def test_concurrent_load_kv( return raise TimeoutError("Took too long to complete async handshake.") + @patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", + FakeNixlWrapper) + def test_handshake_fails_on_kv_cache_layout_mismatch(self, dist_init): + """ + Verify that adding a remote agent fails if kv_cache_layout differs. + This test is only relevant for heterogeneous TP. + """ + vllm_config = create_vllm_config() + + # Mock TP world size to 2 to force heterogeneous TP when + # remote_tp_size=1 + with patch( + "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.get_tensor_model_parallel_world_size", # noqa: E501 + return_value=2): + # Initialize connector and worker (with fake NIXL wrapper) + connector = NixlConnector(vllm_config, KVConnectorRole.WORKER) + connector.connector_worker = FakeNixlConnectorWorker( + vllm_config, connector.engine_id, hand_shake_latency=0) + worker = connector.connector_worker + + # Minimal local registration params used by add_remote_agent + worker.slot_size_bytes = 4096 + worker.block_len = worker.slot_size_bytes * worker.block_size + worker.num_blocks = 1 + worker.dst_num_blocks[worker.engine_id] = worker.num_blocks + + # Metadata with different kv_cache_layout than local worker + mismatched_layout = "HND" if worker.kv_cache_layout != "HND" \ + else "NHD" + meta = NixlAgentMetadata( + engine_id=FakeNixlConnectorWorker.REMOTE_ENGINE_ID, + agent_metadata=FakeNixlWrapper.AGENT_METADATA, + kv_caches_base_addr=[0], + num_blocks=1, + block_len=worker.block_len, + attn_backend_name=worker.backend_name, + kv_cache_layout=mismatched_layout, + ) + + # We don't check layout for homogeneous TP and MLA for now, as the + # whole block is moved. + worker.add_remote_agent(meta, remote_tp_size=2) + with pytest.raises(AssertionError): + worker.add_remote_agent(meta, remote_tp_size=1) + # NOTE: resource cleanup in mp backend is a bit finicky, so the order in which # we put here is important. 
First run ray, it will clean up the resources, then diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index a6eeb278532e..4f51229ffbd2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -30,6 +30,7 @@ from vllm.logger import init_logger from vllm.platforms import _Backend, current_platform from vllm.utils import make_zmq_path, make_zmq_socket +from vllm.v1.attention.backends.utils import get_kv_cache_layout from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus @@ -73,6 +74,7 @@ class NixlAgentMetadata( num_blocks: int block_len: int attn_backend_name: str + kv_cache_layout: str @dataclass @@ -538,7 +540,9 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): attn_backend = backend_name_to_enum(self.backend_name) self._use_flashinfer = attn_backend == _Backend.FLASHINFER_VLLM_V1 self._use_pallas_v1 = attn_backend == _Backend.PALLAS_VLLM_V1 + self.kv_cache_layout = get_kv_cache_layout() logger.debug("Detected attention backend %s", self.backend_name) + logger.debug("Detected kv cache layout %s", self.kv_cache_layout) self._tp_size: dict[EngineId, int] = {self.engine_id: self.world_size} # With heterogeneous TP, P must wait for all assigned D TP workers to @@ -839,7 +843,8 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): kv_caches_base_addr=self.kv_caches_base_addr[self.engine_id], num_blocks=self.num_blocks, block_len=self.block_len, - attn_backend_name=self.backend_name) + attn_backend_name=self.backend_name, + kv_cache_layout=self.kv_cache_layout) ready_event = threading.Event() self._nixl_handshake_listener_t = threading.Thread( target=self._nixl_handshake_listener, @@ -900,8 +905,7 @@ def add_remote_agent(self, self._tp_size[engine_id] = remote_tp_size else: assert self._tp_size[engine_id] == remote_tp_size - # We may eventually enable this after asserting equality in cache - # layout and close outputs. + # TODO We may eventually want to skip enforcing the same attn backend. assert nixl_agent_meta.attn_backend_name == self.backend_name remote_agent_name = self.nixl_wrapper.add_remote_agent( @@ -930,6 +934,9 @@ def add_remote_agent(self, if self._use_flashinfer: # Account for joint KV in FlashInfer. remote_block_size //= 2 + if tp_ratio > 1: + # Heterogeneous TP expects same kv_cache_layout. 
+ assert nixl_agent_meta.kv_cache_layout == self.kv_cache_layout assert nixl_agent_meta.block_len == self.block_len * tp_ratio, ( "Remote P worker KV layer cache must be of shape [2, N, " From d453c1c8b92110c4e80d5b7713b61ed9049a073f Mon Sep 17 00:00:00 2001 From: zifeitong Date: Tue, 12 Aug 2025 12:54:42 -0700 Subject: [PATCH 010/233] Fix torch version check for SM100 mxfp4 (#22535) Signed-off-by: Zifei Tong Signed-off-by: mgoin Co-authored-by: mgoin --- vllm/model_executor/layers/fused_moe/layer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d5a89655e36d..fb38fb91ead6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -741,12 +741,14 @@ def __init__( # we padding globally so EP buffer allocation works if quant_config and quant_config.get_name() == "mxfp4": - if not is_torch_equal_or_newer("2.8.0"): - raise RuntimeError("Mxfp4 on hopper requires torch >= 2.8.0") - if current_platform.is_device_capability( - 90) and not has_triton_kernels(): - raise NotImplementedError( - "Triton kernels must be installed for mxfp4 on hopper") + if not current_platform.is_device_capability(100): + if not is_torch_equal_or_newer("2.8.0"): + raise RuntimeError( + "Mxfp4 on non-blackwell requires torch >= 2.8.0") + if not has_triton_kernels(): + raise NotImplementedError( + "triton_kernels must be installed for " + "mxfp4 on non-blackwell") if (current_platform.is_rocm() or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16): From e56ec0d762b576be252b8508384926e7f0afcc9a Mon Sep 17 00:00:00 2001 From: RUTHLESS-BOT Date: Wed, 13 Aug 2025 04:31:48 +0800 Subject: [PATCH 011/233] [Misc] parametrize 'dtype' in test_flash_mla (#22641) Signed-off-by: RUTHLESS-BOT Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/kernels/attention/test_flashmla.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py index 21b08e45fd6f..81841be58352 100644 --- a/tests/kernels/attention/test_flashmla.py +++ b/tests/kernels/attention/test_flashmla.py @@ -35,11 +35,10 @@ def cal_diff(x: torch.Tensor, y: torch.Tensor, name: str) -> None: @pytest.mark.parametrize("block_size", [64]) @pytest.mark.parametrize("causal", [True]) @pytest.mark.parametrize("varlen", [False, True]) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @torch.inference_mode() def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, - varlen): - # TODO: parametrize using pytest - dtype = torch.bfloat16 + varlen, dtype): device = torch.device("cuda:0") torch.set_default_dtype(dtype) torch.set_default_device(device) @@ -48,7 +47,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal, random.seed(0) print(f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, " - f"{d=}, {dv=}, {causal=}, {varlen=}") + f"{d=}, {dv=}, {causal=}, {varlen=}, {dtype=}") cache_seqlens = torch.full((b, ), mean_sk, dtype=torch.int32) if varlen: From 5572b49ca9170bd3eca1297a2aa84d2b1eb163fc Mon Sep 17 00:00:00 2001 From: Frank Wang <41319051+frankwang28@users.noreply.github.com> Date: Tue, 12 Aug 2025 15:43:06 -0700 Subject: [PATCH 012/233] [Bugfix] Bump DeepGEMM Version to Fix SMXX Layout Issues (#22606) Signed-off-by: frankwang28 --- docker/Dockerfile | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b96d50f0a1c6..a20a4bfb2b88 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -432,7 +432,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Install DeepGEMM from source ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git" -ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1" +ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c" RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' . /etc/environment CUDA_MAJOR="${CUDA_VERSION%%.*}" From 194a9faad7b640263361cf36ee0c8f8d73e3cfcc Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 13 Aug 2025 01:12:26 +0100 Subject: [PATCH 013/233] [Docs] Hide the navigation and toc sidebars on home page (#22749) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/README.md b/docs/README.md index e8d2fd953a96..683e1d37563f 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,3 +1,9 @@ +--- +hide: + - navigation + - toc +--- + # Welcome to vLLM
From b4bcf2b67cffbd1696bbbbfaa1fb7121f3c0a339 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 13 Aug 2025 01:12:30 +0100 Subject: [PATCH 014/233] Fix Transformers backend tensor parallel for multimodal models (#22673) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 51 ++++++++++++++-------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 25b8b69e081b..4ec2b683fc33 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -505,30 +505,47 @@ def tensor_parallel(self): Apply the model's tensor parallelization plan. Currently only supports linear layers. """ - tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {} + # Look for tp plans in all of the PreTrainedModels found in self.model + is_pretrained_model = lambda m: isinstance(m, PreTrainedModel) + supports_tp_plan = lambda m: m.config.base_model_tp_plan is not None + pretrained_models = filter(is_pretrained_model, self.model.modules()) + models_with_tp_plan = filter(supports_tp_plan, pretrained_models) - if not tp_plan and self.tp_size > 1: + if not any(models_with_tp_plan) and self.tp_size > 1: raise ValueError( f"{type(self.model)} does not support tensor parallel yet!") - # Some weight loaders expect linear layers to inherit from vLLM's - # LinearBase class, so we set a default style which causes any - # unspecified linear layers to be replaced with ReplicatedLinear - tp_plan[".*"] = "replicate" - - def _tensor_parallel(module: nn.Module, prefix: str = ""): + def _tensor_parallel(module: nn.Module, + prefix: str = "", + tp_plan=None): + tp_plan = tp_plan or {} + + # If the current module is a PreTrainedModel, set the tp_plan for + # all of its children + if isinstance(module, PreTrainedModel): + tp_plan = module.config.base_model_tp_plan or {} + tp_plan = { + maybe_prefix(prefix, k): v + for k, v in tp_plan.items() + } + + # Some weight loaders expect linear layers to inherit from vLLM's + # LinearBase class, so we set a default style which causes any + # unspecified linear layers to be replaced with ReplicatedLinear for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - for pattern, style in tp_plan.items(): - if re.match(pattern, qual_name) and isinstance( - child_module, nn.Linear): - new_module = replace_linear_class( - child_module, style, self.quant_config) - setattr(module, child_name, new_module) - log_replacement(qual_name, child_module, new_module) - break + if isinstance(child_module, nn.Linear): + generator = (p for p in tp_plan if re.match(p, qual_name)) + pattern = next(generator, None) + style = tp_plan.get(pattern, "replicate") + new_module = replace_linear_class(child_module, style, + self.quant_config) + setattr(module, child_name, new_module) + log_replacement(qual_name, child_module, new_module) else: - _tensor_parallel(child_module, prefix=qual_name) + _tensor_parallel(child_module, + prefix=qual_name, + tp_plan=tp_plan) _tensor_parallel(self.model) From 3eca03bf743783dd7159d8aa5c3764234bb0c5b3 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 13 Aug 2025 08:13:17 +0800 Subject: [PATCH 015/233] [Model] Decouple glm4v (#22751) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 2 +- vllm/model_executor/models/glm4_1v.py | 26 +++++++++++++++++++++----- 
vllm/model_executor/models/registry.py | 2 +- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index a24fa4bcce33..dbbbc5122b80 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -615,7 +615,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Gemma3nForConditionalGeneration` | Gemma 3n | T + I + A | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. | | | ✅︎ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. | | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 7983895687a3..2a89c03bfe7e 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1227,10 +1227,7 @@ class Glm4vForConditionalGeneration(nn.Module, SupportsMultiModal, "k_proj", "v_proj", ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], + "gate_up_proj": ["gate_up_proj"] } # To ensure correct weight loading and mapping. 
@@ -1567,7 +1564,26 @@ def get_mm_mapping(self) -> MultiModelKeys: Get the module prefix in multimodal models """ return MultiModelKeys.from_string_field( - language_model="language_model", + language_model="language_model.model", connector="visual.merger.", tower_model="visual.", ) + + +@MULTIMODAL_REGISTRY.register_processor( + Glm4vMultiModalProcessor, + info=Glm4vProcessingInfo, + dummy_inputs=Glm4vDummyInputsBuilder, +) +class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 64dbde4916a2..b817615b4356 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -208,7 +208,7 @@ "Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501 "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), "Glm4vForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 - "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vForConditionalGeneration"), # noqa: E501 + "Glm4vMoeForConditionalGeneration": ("glm4_1v", "Glm4vMoeForConditionalGeneration"), # noqa: E501 "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501 "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), From e8b1986bfef49212a1175c30ecb957ff9af42805 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 12 Aug 2025 20:14:46 -0400 Subject: [PATCH 016/233] Add hardware plugins to installation doc (#22732) Signed-off-by: Michael Goin Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/getting_started/installation/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index a252343dcee8..f6ecceb85d86 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -14,3 +14,16 @@ vLLM supports the following hardware platforms: - [Google TPU](google_tpu.md) - [Intel Gaudi](intel_gaudi.md) - [AWS Neuron](aws_neuron.md) + +## Hardware Plugins + +The backends below live **outside** the main `vllm` repository and follow the +[Hardware-Pluggable RFC](../design/plugin_system.md). 
+ +| Accelerator | PyPI / package | Repository | +|-------------|----------------|------------| +| Ascend NPU | `vllm-ascend` | | +| Intel Gaudi (HPU) | N/A, install from source | | +| MetaX MACA GPU | N/A, install from source | | +| Rebellions ATOM / REBEL NPU | `vllm-rbln` | | +| IBM Spyre AIU | `vllm-spyre` | | From a48314cdb426996592204d4d13987abadae21958 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Aug 2025 20:18:39 -0700 Subject: [PATCH 017/233] [V0 Deprecation] Remove multi-step scheduling (#22138) Signed-off-by: Woosuk Kwon Signed-off-by: Woosuk Kwon --- .../tests/genai-perf-tests.json | 1 - .../tests/nightly-tests.json | 6 - .buildkite/test-pipeline.yaml | 22 - .github/CODEOWNERS | 1 - tests/async_engine/test_async_llm_engine.py | 409 -------- tests/config/test_config.yaml | 1 - tests/config/test_config_with_model.yaml | 1 - tests/core/test_chunked_prefill_scheduler.py | 10 +- tests/core/test_num_computed_tokens_update.py | 24 +- .../test_multi_step_output_processor.py | 274 ------ .../openai/correctness/test_lmeval.py | 3 - tests/metrics/test_metrics.py | 39 - .../models/language/generation/test_hybrid.py | 26 - .../multi_step/test_correctness_async_llm.py | 232 ----- tests/multi_step/test_correctness_llm.py | 383 -------- tests/samplers/test_logits_processor.py | 70 -- tests/tpu/lora/test_lora.py | 1 - tests/utils_/test_utils.py | 2 - tests/v1/test_oracle.py | 6 - tests/worker/test_model_input.py | 79 -- vllm/config/__init__.py | 2 - vllm/core/scheduler.py | 92 +- vllm/engine/arg_utils.py | 43 +- vllm/engine/async_llm_engine.py | 26 +- vllm/engine/llm_engine.py | 178 +--- vllm/engine/output_processor/interfaces.py | 26 +- vllm/engine/output_processor/multi_step.py | 211 ---- vllm/platforms/cuda.py | 14 +- vllm/platforms/rocm.py | 14 +- vllm/platforms/tpu.py | 7 +- vllm/sequence.py | 38 - vllm/worker/model_runner.py | 7 +- vllm/worker/multi_step_model_runner.py | 908 ------------------ vllm/worker/multi_step_neuron_model_runner.py | 84 -- ...i_step_neuronx_distributed_model_runner.py | 63 -- vllm/worker/multi_step_worker.py | 197 ---- vllm/worker/neuron_worker.py | 22 +- 37 files changed, 57 insertions(+), 3465 deletions(-) delete mode 100644 tests/async_engine/test_async_llm_engine.py delete mode 100644 tests/engine/test_multi_step_output_processor.py delete mode 100644 tests/multi_step/test_correctness_async_llm.py delete mode 100644 tests/multi_step/test_correctness_llm.py delete mode 100644 tests/samplers/test_logits_processor.py delete mode 100644 vllm/engine/output_processor/multi_step.py delete mode 100644 vllm/worker/multi_step_model_runner.py delete mode 100644 vllm/worker/multi_step_neuron_model_runner.py delete mode 100644 vllm/worker/multi_step_neuronx_distributed_model_runner.py delete mode 100644 vllm/worker/multi_step_worker.py diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json index f26ae7634f3d..afb844880f9f 100644 --- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -12,7 +12,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 41b4a4008801..423a3bfe1267 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ 
b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -36,7 +36,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -90,7 +89,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -144,7 +142,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -195,7 +192,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -248,7 +244,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, @@ -301,7 +296,6 @@ "vllm_server_parameters": { "disable_log_stats": "", "gpu_memory_utilization": 0.9, - "num_scheduler_steps": 10, "max_num_seqs": 512, "dtype": "bfloat16" }, diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ebcf51981ef3..740be2bc8770 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -67,7 +67,6 @@ steps: - python3 standalone_tests/lazy_imports.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine - - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - pytest -v -s test_inputs.py - pytest -v -s test_outputs.py - pytest -v -s multimodal @@ -773,27 +772,6 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins -- label: Multi-step Tests (4 GPUs) # 36min - mirror_hardwares: [amdexperimental] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/model_executor/layers/sampler.py - - vllm/sequence.py - - vllm/worker/worker_base.py - - vllm/worker/worker.py - - vllm/worker/multi_step_worker.py - - vllm/worker/model_runner_base.py - - vllm/worker/model_runner.py - - vllm/worker/multi_step_model_runner.py - - vllm/engine - - tests/multi_step - commands: - # this test is quite flaky - # TODO: investigate and fix. 
- # - pytest -v -s multi_step/test_correctness_async_llm.py - - pytest -v -s multi_step/test_correctness_llm.py - - label: Pipeline Parallelism Test # 45min mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a0a327319a46..b0dd5e99d4c7 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -36,7 +36,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm /tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256 /tests/models @DarkLight1337 @ywang96 -/tests/multi_step @alexm-redhat @comaniac /tests/multimodal @DarkLight1337 @ywang96 /tests/prefix_caching @comaniac @KuntaiDu /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py deleted file mode 100644 index 0eb7a6eb52aa..000000000000 --- a/tests/async_engine/test_async_llm_engine.py +++ /dev/null @@ -1,409 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -import uuid -from asyncio import CancelledError -from copy import copy -from dataclasses import dataclass, field -from typing import Any, Optional - -import pytest -import pytest_asyncio -import torch - -from vllm import SamplingParams -from vllm.config import ParallelConfig -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine -from vllm.outputs import RequestOutput as RealRequestOutput -from vllm.sampling_params import RequestOutputKind - -from ..utils import wait_for_gpu_memory_to_clear - - -@dataclass -class RequestOutput: - request_id: int - finished: bool = False - - -@dataclass -class MockModelConfig: - use_async_output_proc = True - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - - -class MockEngine: - - def __init__(self): - self.step_calls = 0 - self.add_request_calls = 0 - self.abort_request_calls = 0 - self.request_id = None - # Ugly, remove dependency when possible - self.parallel_config = ParallelConfig() - self.model_config = MockModelConfig() - - async def step_async(self, virtual_engine): - # PP size is 1, ignore virtual engine - self.step_calls += 1 - return [RequestOutput( - request_id=self.request_id)] if self.request_id else [] - - async def process_model_inputs_async(self, *args, **kwargs): - pass - - async def stop_remote_worker_execution_loop_async(self): - pass - - def generate(self, request_id): - self.request_id = request_id - - def stop_generating(self): - self.request_id = None - - def add_request(self, **kwargs): - del kwargs # Unused - self.add_request_calls += 1 - print(f'Request calls: {self.add_request_calls}') - - async def add_request_async(self, **kwargs): - self.add_request_calls += 1 - return - - def abort_request(self, request_id): - del request_id # Unused - self.abort_request_calls += 1 - - def has_unfinished_requests(self): - return self.request_id is not None - - def has_unfinished_requests_for_virtual_engine(self, virtual_engine): - return self.request_id is not None - - -class MockAsyncLLMEngine(AsyncLLMEngine): - _engine_class = MockEngine - - -@pytest.mark.asyncio -async def test_new_requests_event(): - params = SamplingParams() - - engine = MockAsyncLLMEngine() - engine.start_background_loop() - await asyncio.sleep(0.01) - assert engine.engine.step_calls == 0 - - await 
engine.add_request("1", "", params) - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 1 - assert engine.engine.step_calls == 1 - - await engine.add_request("2", "", params) - engine.engine.generate("2") - await asyncio.sleep(0) - await asyncio.sleep(0) - await asyncio.sleep(0) - assert engine.engine.add_request_calls == 2 - assert engine.engine.step_calls >= 2 - await asyncio.sleep(0.001) - assert engine.engine.step_calls >= 3 - engine.engine.stop_generating() - await asyncio.sleep(0.001) - old_step_calls = engine.engine.step_calls - await asyncio.sleep(0.001) - assert engine.engine.step_calls == old_step_calls - - await engine.add_request("3", "", params) - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == old_step_calls + 1 - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == old_step_calls + 1 - - engine = MockAsyncLLMEngine() - assert engine.get_model_config() is not None - assert engine.get_tokenizer() is not None - assert engine.get_decoding_config() is not None - - -def start_engine(): - wait_for_gpu_memory_to_clear( - devices=list(range(torch.cuda.device_count())), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) - - num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1")) - print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}") - - return AsyncLLMEngine.from_engine_args( - AsyncEngineArgs(model="facebook/opt-125m", - enforce_eager=True, - num_scheduler_steps=num_scheduler_steps)) - - -def uid() -> str: - return str(uuid.uuid4()) - - -@pytest_asyncio.fixture(scope="module") -async def async_engine(): - # We cannot use monkeypatch since this is a module - # scoped fixture and monkeypatch is function scoped. 
- previous_value = os.getenv("VLLM_USE_V1", None) - os.environ["VLLM_USE_V1"] = "0" - engine = await asyncio.get_event_loop().run_in_executor(executor=None, - func=start_engine) - try: - yield engine - finally: - engine.shutdown_background_loop() - del engine - await asyncio.sleep(0.1) - cleanup_dist_env_and_memory() - - if previous_value: - os.environ["VLLM_USE_V1"] = previous_value - else: - del os.environ["VLLM_USE_V1"] - - -@pytest.fixture() -def should_do_global_cleanup_after_test(request) -> bool: - # So we can share the async engine fixture between these tests - return False - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_asyncio_run(async_engine, stop): - - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - async def run(prompt: str): - sampling_params = SamplingParams( - temperature=0, - max_tokens=32, - min_tokens=32, - stop=stop, - ) - - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - sampling_params, - request_id=uid()): - output_count += 1 - final_output = output - return final_output, output_count - - results = await asyncio.gather( - run("test0"), - run("test0"), - ) - assert len(results) == 2 - first, second = results - - # remove nondeterministic fields for comparison - first[0].metrics = None - second[0].metrics = None - first[0].request_id = None - second[0].request_id = None - - assert str(first) == str(second) - - output_count = results[0][1] - if num_scheduler_steps == 1: - assert output_count == 32 - else: - assert 1 < output_count < 32 - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_output_kinds(async_engine, stop): - """Test that output_kind works as expected and that - results are equivalent across different kinds.""" - - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - sampling_params = SamplingParams( - temperature=0, - max_tokens=32, - min_tokens=32, - stop=stop, - ) - - async def run(prompt: str, kind: RequestOutputKind): - params = copy(sampling_params) - params.output_kind = kind - - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - params, - request_id=uid()): - output_count += 1 - final_output = output - - assert final_output is not None - assert final_output.finished - - return (final_output.prompt_token_ids, - final_output.outputs[0].token_ids, - final_output.outputs[0].text, output_count) - - async def run_deltas(prompt: str): - params = copy(sampling_params) - params.output_kind = RequestOutputKind.DELTA - - prompt_tokens = None - output_tokens: list[int] = [] - output_text = "" - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - params, - request_id=uid()): - token_ids = output.outputs[0].token_ids - text = output.outputs[0].text - final_output = output - - # Ensure we get prompt ids iff we haven't yet received output tokens - if output_tokens: - assert 1 <= len(token_ids) <= num_scheduler_steps - assert stop or text - assert not output.prompt_token_ids - else: - assert output.prompt_token_ids - prompt_tokens = output.prompt_token_ids - - output_tokens.extend(token_ids) - output_text += text - - output_count += 1 - - assert final_output is not None - assert final_output.finished - - return prompt_tokens, output_tokens, output_text, 
output_count - - results = await asyncio.gather( - run("common input prompt", RequestOutputKind.CUMULATIVE), - run("common input prompt", RequestOutputKind.FINAL_ONLY), - run_deltas("common input prompt")) - - # Make sure outputs are the same - prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results) - assert len(prompt_set) == 1 - - text_set = set(text for _, _, text, _ in results) - assert len(text_set) == 1 - - tokens_set = set(tuple(ids) for _, ids, _, _ in results) - assert len(tokens_set) == 1 - - cumulative, final, deltas = results - - # output message counts - assert cumulative[3] == deltas[3] - - if num_scheduler_steps == 1: - assert cumulative[3] == 32 - else: - assert 1 < cumulative[3] < 32 - - assert final[3] == 1 - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_cancellation(async_engine, stop): - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - sampling_params = SamplingParams( - temperature=0, - min_tokens=13, - max_tokens=13, - stop=stop, - ) - - stop_at = 5 if num_scheduler_steps == 1 else 1 - - request_id = uid() - - i = 0 - with pytest.raises(CancelledError): - async for output in async_engine.generate("test2", - sampling_params, - request_id=request_id): - assert not output.finished - i += 1 - if i == stop_at: - await async_engine.abort(request_id) - - assert i == stop_at - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_delayed_generator(async_engine, stop): - scheduler_config = await async_engine.get_scheduler_config() - - if scheduler_config.num_scheduler_steps != 1: - pytest.skip("no need to test this one with multistep") - - sampling_params = SamplingParams( - temperature=0, - min_tokens=10, - max_tokens=10, - stop=stop, - ) - - stream = async_engine.generate("test3", sampling_params, request_id=uid()) - i = 0 - final_output: Optional[RealRequestOutput] = None - async for output in stream: - final_output = output - if i == 0: - # wait for generation to complete before consuming - # the remaining messages - await asyncio.sleep(1) - if i < 9: - assert not output.finished - i += 1 - - assert i == 10 - assert final_output is not None - assert len(final_output.outputs[0].token_ids) == 10 - assert final_output.finished - - -@pytest.mark.asyncio(scope="module") -async def test_invalid_argument(async_engine): - scheduler_config = await async_engine.get_scheduler_config() - - if scheduler_config.num_scheduler_steps != 1: - pytest.skip("no need to test this one with multistep") - - sampling_params = SamplingParams( - temperature=0, - min_tokens=10, - max_tokens=10, - ) - - # Targeting specific DP rank only supported in v1 multi-instance DP - with pytest.raises(ValueError): - async for _ in async_engine.generate("test", - sampling_params, - request_id=uid(), - data_parallel_rank=0): - pass diff --git a/tests/config/test_config.yaml b/tests/config/test_config.yaml index 5090e8f357bb..a16857b5f2fb 100644 --- a/tests/config/test_config.yaml +++ b/tests/config/test_config.yaml @@ -2,4 +2,3 @@ port: 12312 served_model_name: mymodel tensor_parallel_size: 2 trust_remote_code: true -multi_step_stream_outputs: false diff --git a/tests/config/test_config_with_model.yaml b/tests/config/test_config_with_model.yaml index d8c8c7bc8162..9fbdb77d4ef2 100644 --- a/tests/config/test_config_with_model.yaml +++ b/tests/config/test_config_with_model.yaml @@ -4,4 +4,3 @@ 
port: 12312 served_model_name: mymodel tensor_parallel_size: 2 trust_remote_code: true -multi_step_stream_outputs: false diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index d4dacc4f1296..ce1fe189b3ca 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -644,11 +644,9 @@ def cannot_append_second_group2(seq_group, num_lookahead_slots): assert out.num_batched_tokens == max_num_batched_tokens -@pytest.mark.parametrize("num_scheduler_steps", [1, 5]) -def test_chunked_prefill_spec_prefill(num_scheduler_steps): +def test_chunked_prefill_spec_prefill(): """Verify that the num_lookahead_slots is set appropriately for an all""" - """prefill batch depending on whether multi-step scheduling is enabled""" - """or not""" + """prefill batch.""" block_size = 4 max_seqs = 30 max_model_len = 200 @@ -661,7 +659,6 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps): max_model_len, enable_chunked_prefill=True, num_lookahead_slots=num_lookahead_slots, - num_scheduler_steps=num_scheduler_steps, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 @@ -679,8 +676,7 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps): assert out.num_prefill_groups == 1 assert out.num_batched_tokens == max_num_batched_tokens print(out.num_lookahead_slots) - assert out.num_lookahead_slots == (0 if (num_scheduler_steps == 1) else - num_lookahead_slots) + assert out.num_lookahead_slots == 0 def test_chunked_prefill_max_seqs(): diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index 9e1b7913dfb9..131a7b3a6299 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -6,7 +6,6 @@ from tests.conftest import VllmRunner from tests.core.utils import create_dummy_prompt from vllm.engine.llm_engine import LLMEngine -from vllm.platforms import current_platform from vllm.sequence import SequenceGroup MODEL = "JackFram/llama-160m" @@ -17,32 +16,19 @@ def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup): scheduler.add_seq_group(seq_group) -@pytest.mark.parametrize("num_scheduler_steps", [1, 8]) @pytest.mark.parametrize("enable_chunked_prefill", [False, True]) @pytest.mark.parametrize("enforce_eager", [False, True]) -def test_num_computed_tokens_update(num_scheduler_steps: int, - enable_chunked_prefill: bool, +def test_num_computed_tokens_update(enable_chunked_prefill: bool, enforce_eager: bool): - is_multi_step = num_scheduler_steps > 1 - is_multi_step_chunked_prefill = is_multi_step and enable_chunked_prefill - - if is_multi_step_chunked_prefill and current_platform.is_rocm(): - pytest.skip("Multi-step with Chunked-Prefill does not support " - "rocm_flash_attn backend") - # Make a vllm engine runner = VllmRunner(model_name=MODEL, gpu_memory_utilization=0.7, - num_scheduler_steps=num_scheduler_steps, enable_chunked_prefill=enable_chunked_prefill, enforce_eager=enforce_eager) engine: LLMEngine = runner.llm.llm_engine - # In multi-step + chunked-prefill there is no separate single prompt step. - # What is scheduled will run for num_scheduler_steps always. 
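Without multi-step, a scheduled prompt is always finished in a single engine step; a small sketch of the bookkeeping the simplified assertions below rely on (lengths are illustrative, not taken from the test):

# One engine.step() completes the prompt, so the computed-token count equals
# the prompt length afterwards; each later step() adds exactly one decode token.
prompt_len = 16                      # illustrative prompt length
num_computed_tokens = prompt_len     # after the single prompt step
for decode_step in range(1, 5):
    num_computed_tokens += 1         # one new token per engine.step()
    assert num_computed_tokens == prompt_len + decode_step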
- num_prompt_steps = num_scheduler_steps \ - if is_multi_step_chunked_prefill else 1 + num_prompt_steps = 1 num_output_tokens_list = [4, 8, 12, 15, 16, 17] @@ -73,10 +59,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, # Test correctness of num_computed_tokens after the decode steps assert seq.data.get_num_computed_tokens( ) == prompt_num_computed_tokens + decode_step_counter - for _ in range(num_scheduler_steps): - # decode step - engine.step() - decode_step_counter += 1 + engine.step() + decode_step_counter += 1 # Test correctness of num_computed_tokens after the sequence finish. assert seq.data.get_num_computed_tokens( diff --git a/tests/engine/test_multi_step_output_processor.py b/tests/engine/test_multi_step_output_processor.py deleted file mode 100644 index 458f4deb743a..000000000000 --- a/tests/engine/test_multi_step_output_processor.py +++ /dev/null @@ -1,274 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from unittest.mock import MagicMock - -import pytest -from transformers import PreTrainedTokenizer - -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.sampling_params import SamplingParams -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SequenceOutput, SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.utils import Counter - -from ..core.utils import create_seq_group - - -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [1, 12]) -@pytest.mark.skip_global_cleanup -def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): - """Verify multi-step decoding appends token ids correctly. - - We append token ids and verify all the token ids were appended correctly. - Note that ignore_eos=True. 
- """ - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=1024, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams(max_tokens=seq_output_len + - num_new_tokens, - ignore_eos=True), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids - output_processor.process_outputs(seq_group, outputs) - assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8]) -@pytest.mark.parametrize("max_tokens", [128 + 3]) -@pytest.mark.skip_global_cleanup -def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, max_tokens: int): - """Verify tokens after max_tokens are dropped and not appended to the - sequence. - """ - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams(max_tokens=max_tokens, ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to not go over max tokens in len. - assert seq.get_len() == seq_prompt_len + max_tokens - - # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [12]) -@pytest.mark.parametrize("seed", list(range(6))) -@pytest.mark.skip_global_cleanup -def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, seed: int): - """Verify the eos token id is included in the sequence, but subsequent - tokens are dropped (not appended to sequence). 
- """ - random.seed(seed) - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - eos_token_id = 100 - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - # Ensure enough space. - max_tokens=seq_output_len + num_new_tokens, ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - assert eos_token_id not in new_token_ids - eos_index = random.randint(0, len(new_token_ids) - 1) - new_token_ids[eos_index] = eos_token_id - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to not go beyond provided eos. - assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1) - - # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:eos_index + 1] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [12]) -@pytest.mark.parametrize("seed", list(range(6))) -@pytest.mark.skip_global_cleanup -def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, seed: int): - """When sampling parameters dictate that we should ignore the eos token id, - ensure all token ids are appended even if the eos token id is emitted. - """ - random.seed(seed) - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - eos_token_id = 100 - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - # Ensure enough space. - max_tokens=seq_output_len + num_new_tokens, - ignore_eos=True, - ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - assert eos_token_id not in new_token_ids - eos_index = random.randint(0, len(new_token_ids) - 1) - new_token_ids[eos_index] = eos_token_id - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to go beyond eos. 
- assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens - - # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens - - seq_output_len] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -def mock_tokenizer(eos_token_id=1000): - tokenizer = MagicMock(spec=PreTrainedTokenizer) - tokenizer.eos_token_id = eos_token_id - return tokenizer diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index d75731637d28..684407cd6ee9 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -26,15 +26,12 @@ MORE_ARGS_LIST = [ [], # Default ["--enable-chunked-prefill"], # Chunked - ["--num-scheduler-steps", "8"], # MS - ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream ] MAX_WAIT_SECONDS = None if current_platform.is_tpu(): MORE_ARGS_LIST = [ [], # Default - # ["--num-scheduler-steps", "8"], # Multi-step << currently fails ] MAX_WAIT_SECONDS = 600 diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 8cae8a80d38e..dbd9c518e020 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -94,45 +94,6 @@ def test_metric_counter_generation_tokens( f"metric: {metric_count!r}") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("max_tokens", [128, 129]) -@pytest.mark.parametrize("disable_async_output_proc", [True, False]) -def test_metric_counter_generation_tokens_multi_step( - vllm_runner, - example_prompts, - model: str, - max_tokens: int, - disable_async_output_proc: bool, -) -> None: - num_scheduler_steps = 8 - with vllm_runner( - model, - disable_log_stats=False, - gpu_memory_utilization=0.4, - num_scheduler_steps=num_scheduler_steps, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.llm.get_tokenizer() - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() - vllm_generation_count = 0 - for i in range(len(example_prompts)): - vllm_output_ids, vllm_output_str = vllm_outputs[i] - prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. - # We're interested only in the count of the generation tokens. - vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) - - # The multi-step scheduling will continue to execute forward even when - # encountering EOS, leading to slightly imprecise metrics. 
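Concretely, each prompt can run at most num_scheduler_steps - 1 steps past EOS, so the mismatch stays strictly below len(example_prompts) * num_scheduler_steps; worked numbers for the bound used in the assertion below (the prompt count is an assumption, not read from the fixture):

# With 8 prompts and num_scheduler_steps = 8, at most 7 spurious tokens per
# prompt can be counted, so the difference is strictly less than 64.
num_prompts = 8            # assumed size of example_prompts
num_scheduler_steps = 8
max_overshoot = num_prompts * (num_scheduler_steps - 1)
assert max_overshoot < num_prompts * num_scheduler_steps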
- assert abs(vllm_generation_count - metric_count) <\ - len(example_prompts) * num_scheduler_steps, \ - (f"generation token count: {vllm_generation_count!r}\n" - f"metric: {metric_count!r}") - - @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize( diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 76f6c226bab7..19fcbf561640 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -331,32 +331,6 @@ def test_state_cleanup( "could be related to finished_requests_ids") -@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) -@pytest.mark.parametrize("max_tokens", [64]) -def test_multistep_correctness( - vllm_runner, - example_prompts, - model: str, - max_tokens: int, -) -> None: - with vllm_runner(model, num_scheduler_steps=8, - max_num_seqs=2) as vllm_model: - vllm_outputs_multistep = vllm_model.generate_greedy( - example_prompts, max_tokens) - - with vllm_runner(model, num_scheduler_steps=1, - max_num_seqs=2) as vllm_model: - vllm_outputs_single_step = vllm_model.generate_greedy( - example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=vllm_outputs_multistep, - outputs_1_lst=vllm_outputs_single_step, - name_0="vllm_outputs_multistep", - name_1="vllm_outputs_single_step", - ) - - @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @pytest.mark.parametrize("max_tokens", [64]) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py deleted file mode 100644 index 56e339d485c5..000000000000 --- a/tests/multi_step/test_correctness_async_llm.py +++ /dev/null @@ -1,232 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Test the AsyncLLMEngine with multi-step-decoding -from typing import Optional - -import pytest - -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_logprobs_close -from ..utils import (completions_with_server_args, get_client_text_generations, - get_client_text_logprob_generations) - -MODELS = [ - "JackFram/llama-160m", -] -NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps -NUM_PROMPTS = [10] - -DEFAULT_SERVER_ARGS: list[str] = [ - "--distributed-executor-backend", - "ray", - "--gpu-memory-utilization", - "0.85", - "--swap-space", - "16", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize(("tp_size, pp_size"), [ - (1, 1), - (2, 2), -]) -@pytest.mark.parametrize("eager_mode", [False, True]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("is_async", [True]) -@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) -@pytest.mark.parametrize("enable_chunked_prefill", [True, False]) -@pytest.mark.asyncio -async def test_multi_step( - example_prompts, - model: str, - tp_size: int, - pp_size: int, - eager_mode: int, - num_scheduler_steps: int, - num_prompts: int, - is_async: bool, - num_logprobs: Optional[int], - attention_backend: str, - enable_chunked_prefill: bool, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step scheduling in an OpenAI-protocol - client/server environment. - - Set up an engine with single-step scheduling as a ground-truth reference. 
- - Send a completions API request to both engines with the same prompts. - - Validate: - * Generated tokens match - * Generated logprobs are all very close - - Args: - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - tp_size: degree of tensor-parallelism - pp_size: degree of pipeline-parallelism - eager_mode - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> no logprobs - """ - if enable_chunked_prefill and \ - (pp_size > 1 or attention_backend != "FLASH_ATTN"): - pytest.skip("Multi-step with Chunked-Prefill only supports" - "PP=1 and FLASH_ATTN backend") - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] - - if not is_async: - ms_server_args += ["--disable-async-output-proc"] - - if eager_mode: - ms_server_args.append("--enforce-eager") - - if enable_chunked_prefill: - ms_server_args.append("--enable-chunked-prefill") - - distributed_args = [ - "--tensor-parallel-size", - str(tp_size), - "--pipeline-parallel-size", - str(pp_size), - ] - - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 5x to 1200 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts, - model, - server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - test_completions = await completions_with_server_args( - prompts, - model, - ms_server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations - - # Assert multi-step scheduling produces nearly-identical logprobs - # to single-step scheduling. - ref_text_logprobs = get_client_text_logprob_generations( - ref_completions) - test_text_logprobs = get_client_text_logprob_generations( - test_completions) - check_logprobs_close( - outputs_0_lst=ref_text_logprobs, - outputs_1_lst=test_text_logprobs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize(("tp_size, pp_size"), [ - (1, 2), -]) -@pytest.mark.asyncio -async def test_multi_step_pp_smoke( - tp_size: int, - pp_size: int, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """ - Smoke test for the vLLM engine with multi-step scheduling in an - OpenAI-protocol client/server environment. - - This tests compares the outputs between multi-step scheduling and - single-step scheduling. Notably, this test lets the engines generate - more tokens (default is 5) and test for an exact match over all the - tokens. 
- - Args: - tp_size: degree of tensor-parallelism - pp_size: degree of pipeline-parallelism - eager_mode - """ - - model = "JackFram/llama-160m" - num_scheduler_steps = 8 - attention_backend = "FLASH_ATTN" - max_num_seqs = 3 - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - # Prompt from the ShareGPT dataset - prompts = [ - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - ] - # Use varying max_tokens to introduce scheduling randomness. - max_tokens = [10 * i for i in range(1, len(prompts) + 1)] - assert len(prompts) == len(max_tokens) - - test_args = [ - "--tensor-parallel-size", - str(tp_size), "--pipeline-parallel-size", - str(pp_size), "--max-num-seqs", - str(max_num_seqs) - ] - - server_args = DEFAULT_SERVER_ARGS + test_args - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ - test_args - - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 3x to 720 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) - - test_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=ms_server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) - - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. 
- ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - - assert ref_generations == test_generations diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py deleted file mode 100644 index 0df00c98b72c..000000000000 --- a/tests/multi_step/test_correctness_llm.py +++ /dev/null @@ -1,383 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Test the LLMEngine with multi-step-decoding - -import copy -from typing import Optional - -import pytest - -from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_logprobs_close, check_outputs_equal - -MODELS = [ - "JackFram/llama-160m", -] -NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps -NUM_PROMPTS = [10] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("enable_chunked_prefill", [False, True]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True, False]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [None, 5]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN", "FLASHINFER"]) -def test_multi_step_llm( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - enable_chunked_prefill: bool, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step scheduling via sync LLM Engine. - - Set up a HuggingFace (HF) transformers model as a ground-truth reference. - - Prompt them with the same example prompts. - - Validate: - * Generated tokens match - * Generated logprobs are all very close - - Args: - hf_runner: HF transformers model runner fixture - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - enable_chunked_prefill: chunked-prefill on/off - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> 1 logprob returned. 
- """ - if current_platform.is_rocm() and \ - (attention_backend == "FLASHINFER" or enable_chunked_prefill): - pytest.skip( - "Multi-Step with FLASHINFER or Chunked-Prefill is not supported" - "on ROCm") - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=enable_chunked_prefill, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs)) - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - hf_model.generate_greedy_logprobs_limit( - prompts, max_tokens, num_logprobs)) - - if num_logprobs is None: - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) -def test_multi_step_llm_w_prompt_logprobs( - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - num_prompt_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test prompt logprobs with multi-step scheduling via sync LLM Engine. - - Set up a vLLM engine instance w/ single-step scheduling as a ground-truth - reference. - - Prompt them with the same example prompts. - - Validate: - * All generated logprobs are all very close - - Args: - hf_runner: HF transformers model runner fixture - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> no logprobs - num_prompt_logprobs: number of logprobs to return for each prompt token; - note that this argument is not supported by the - OpenAI completions endpoint. 
- """ - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - ) as vllm_model: - single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) - - check_logprobs_close( - outputs_0_lst=single_step_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [None, 5]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) -@pytest.mark.skipif( - current_platform.is_rocm(), - reason="Multi-Step + Chunked-Prefill not supported on ROCm") -def test_multi_step_llm_chunked_prefill_prefix_cache( - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step+"single-step chunked prefill"+APC. - - Set up contrived scenario which tests for a possible failure mode of - scheduling with multi-step+"single-step chunked prefill"+APC - - "single-step chunked prefill" here refers to the current vLLM multi-step+ - chunked-prefill implementation, which requires that a prefill may only - be scheduled in the same step as decodes if the prefill prompt fits in a - single chunk (note that "complete" multi-step+chunked-prefill would allow - a prefill to span multiple chunks & multiple steps but that is not yet - the case.) - - "APC" is short for "automatic prefix caching". - - This test creates a scenario where the scheduler must decide whether/how - to schedule a prefill with a prompt that exceeds the available token budget. - The correct behavior for multi-step+"single-step chunked prefill"+APC is to - put off scheduling the prefill until a future step. - - Validate that: - * Multi-step kernels do not raise an exception due to incorrect scheduler - behavior - * Generated tokens match between - multi-step+"single-step chunked prefill"+APC and - single-step scheduling. 
- * (If logprobs are enabled) check logprobs are close enough - - Args: - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> 1 logprob returned. - """ - - # Set up contrived test for correct scheduling behavior with - # multi-step+"single-step chunked prefill"+APC. - # - # Assume block_size=16 - # - # Assume max_num_batched_tokens=48 - # => Per-step token budget=48 - # - # 1. Scheduler schedules 0th prompt (24 tokens) - # => Remaining token budget=24 - # 2. Scheduler attempts to schedule 1st prompt (30 tokens) - # * 30 tokens exceeds 24 token remaining budget - # * Correct behavior: do not schedule this prompt in this step - # * Incorrect behavior: schedule prompt chunk - # * `do_sample=False` for this prompt in this step - # * Chunk size = (remaining tokens // block size) * block size - # - # The Incorrect scheduling behavior - if it occurs - will cause an exception - # in the model runner resulting from `do_sample=False`. - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - assert len(example_prompts) >= 2 - challenge_prompts = copy.deepcopy(example_prompts) - challenge_prompts[0] = ( - 'vLLM is a high-throughput and memory-efficient ' - 'inference and serving engine for LLMs.\n') # 24 tok - challenge_prompts[1] = ( - 'Briefly describe the major milestones in the ' - 'development of artificial intelligence from 1950 to 2020.\n' - ) # 30 tok - - # If necessary, adjust the length of `challenge_prompts` to match - # `num_prompts` - if len(challenge_prompts) < num_prompts: - challenge_prompts = (challenge_prompts * - ((num_prompts // len(challenge_prompts)) + 1)) - challenge_prompts = challenge_prompts[:num_prompts] - assert len(challenge_prompts) == num_prompts - - # Single-step scheduler baseline - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_baseline = ( - vllm_model.generate_greedy(challenge_prompts, max_tokens) if - num_logprobs is None else vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) - - # multi-step+"single-step chunked prefill"+APC - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=True, - enable_prefix_caching=True, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_w_features = ( - vllm_model.generate_greedy(challenge_prompts, max_tokens) if - num_logprobs is None else vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) - - if num_logprobs is None: - # No-logprobs test - check_outputs_equal( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - 
name_1="multi-step+features", - ) - else: - # Yes-logprobs test - check_logprobs_close( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py deleted file mode 100644 index 123f9595e97b..000000000000 --- a/tests/samplers/test_logits_processor.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm import SamplingParams - -MODELS = ["distilbert/distilgpt2"] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This file tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_logits_processor_force_generate( - vllm_runner, - example_prompts, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - tokenizer = vllm_model.llm.get_tokenizer() - repeat_times = 2 - enforced_answers = " vLLM" - vllm_token_ids = tokenizer.encode(enforced_answers, - add_special_tokens=False) - max_tokens = len(vllm_token_ids) * repeat_times - - def pick_vllm(token_ids, logits): - token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)] - logits[token_id] = torch.finfo(logits.dtype).max - return logits - - params_with_logprobs = SamplingParams( - logits_processors=[pick_vllm], - prompt_logprobs=3, - max_tokens=max_tokens, - ) - - # test logits_processors when prompt_logprobs is not None - vllm_model.llm._add_request( - example_prompts[0], - params=params_with_logprobs, - ) - - # test prompt_logprobs is not None - vllm_model.llm._add_request( - example_prompts[1], - params=SamplingParams( - prompt_logprobs=3, - max_tokens=max_tokens, - ), - ) - - # test grouped requests - vllm_model.llm._add_request( - example_prompts[2], - params=SamplingParams(max_tokens=max_tokens), - ) - - outputs = vllm_model.llm._run_engine(use_tqdm=False) - - assert outputs[0].outputs[0].text == enforced_answers * repeat_times diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py index 4c47b8c43caf..636108e98581 100644 --- a/tests/tpu/lora/test_lora.py +++ b/tests/tpu/lora/test_lora.py @@ -30,7 +30,6 @@ def use_v1_only(monkeypatch: pytest.MonkeyPatch): def setup_vllm(num_loras: int, tp: int) -> vllm.LLM: return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct", - num_scheduler_steps=1, max_model_len=256, max_seq_len_to_capture=256, max_num_seqs=8, diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index a2db1ae68434..8be1e103dc65 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -236,7 +236,6 @@ def test_config_args(parser_with_config, cli_config_file): ['serve', 'mymodel', '--config', cli_config_file]) assert args.tensor_parallel_size == 2 assert args.trust_remote_code - assert not args.multi_step_stream_outputs def test_config_file(parser_with_config): @@ -828,7 +827,6 @@ def test_model_specification(parser_with_config, cli_config_file, ]) assert args.tensor_parallel_size == 2 assert args.trust_remote_code is True - assert args.multi_step_stream_outputs is False assert args.port == 12312 diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index a756c89b520f..1f16e92f657e 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -58,12 +58,6 @@ def 
test_unsupported_configs(monkeypatch): disable_async_output_proc=True, ).create_engine_config() - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - num_scheduler_steps=5, - ).create_engine_config() - with pytest.raises(NotImplementedError): AsyncEngineArgs( model=MODEL, diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index ec33d334ab65..2031f41fab87 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -11,7 +11,6 @@ from vllm.model_executor import SamplingMetadata from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata -from vllm.worker.multi_step_model_runner import StatefulModelInput from vllm.worker.pooling_model_runner import ( ModelInputForGPUWithPoolingMetadata) @@ -166,81 +165,3 @@ def test_embedding_model_runner_input(): None) == getattr(attn_metadata, field.name, None) # Pooling metadata is not broadcast. assert received_model_input.pooling_metadata is None - - -def test_multi_step_model_runner_input(): - sampling_metadata = SamplingMetadata( - ["seq_group"], - "selected_token_indices", - "categorized_sample_indices", - "num_prompts", - ) - attn_metadata = AttentionMetadata( - num_prefills=1, - num_prefill_tokens=2, - num_decode_tokens=3, - slot_mapping=torch.zeros(1), - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - ) - frozen_model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens=torch.ones(10), - input_positions=torch.ones(10), - sampling_metadata=sampling_metadata, - attn_metadata=attn_metadata) - - model_input = StatefulModelInput( - frozen_model_input=frozen_model_input, - is_last_step=True, - is_first_multi_step=False, - current_step=4, - last_sampled_token_ids=torch.ones((10, 1)), - is_multi_step=True, - num_queries=8, - num_seqs=5, - cached_outputs=[], - ) - - assert isinstance(model_input, StatefulModelInput) - - # Test round trip serialization. - tensor_dict = model_input.as_broadcastable_tensor_dict() - attn_backend = MockAttentionBackend() - received_model_input = (StatefulModelInput.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=attn_backend)) - - received_frozen_input = received_model_input.frozen_model_input - - # Check that received copy has correct values. - assert isinstance(received_model_input, StatefulModelInput) - assert received_frozen_input.input_tokens is not None - assert (received_frozen_input.input_tokens == - frozen_model_input.input_tokens).all() - assert received_frozen_input.input_positions is not None - assert (received_frozen_input.input_positions == - frozen_model_input.input_positions).all() - assert received_frozen_input.multi_modal_kwargs is None - assert (frozen_model_input.multi_modal_kwargs == - frozen_model_input.multi_modal_kwargs) - assert received_frozen_input.lora_requests is None - assert (received_frozen_input.lora_requests == - frozen_model_input.lora_requests) - assert received_frozen_input.lora_mapping is None - assert ( - received_frozen_input.lora_mapping == frozen_model_input.lora_mapping) - for field in dataclasses.fields(AttentionMetadata): - assert getattr(received_frozen_input.attn_metadata, field.name, - None) == getattr(attn_metadata, field.name, None) - # For sampling metadata, only selected_token_indices is copied. 
- assert (received_frozen_input.sampling_metadata.selected_token_indices == - sampling_metadata.selected_token_indices) - assert received_frozen_input.sampling_metadata.seq_groups is None - - # check non frozen fields - assert received_model_input.is_last_step == model_input.is_last_step - assert (received_model_input.is_first_multi_step == - model_input.is_first_multi_step) - assert received_model_input.current_step == model_input.current_step - assert (received_model_input.last_sampled_token_ids == - model_input.last_sampled_token_ids).all() - assert received_model_input.is_multi_step == model_input.is_multi_step diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index df4eb33f5d45..6649cd89ee34 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3779,8 +3779,6 @@ def __str__(self): f"observability_config={self.observability_config!r}, " f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " - f"num_scheduler_steps={self.scheduler_config.num_scheduler_steps}, " - f"multi_step_stream_outputs={self.scheduler_config.multi_step_stream_outputs}, " # noqa f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa f"use_async_output_proc={self.model_config.use_async_output_proc}, " diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 61346da145bb..63894e7f5dc8 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -929,8 +929,7 @@ def _schedule_swapped( ) def _get_prompt_limit(self, seq_group: SequenceGroup) -> int: - if (self.scheduler_config.chunked_prefill_enabled - and not self.scheduler_config.is_multi_step): + if self.scheduler_config.chunked_prefill_enabled: prompt_limit = self.scheduler_config.max_model_len else: prompt_limit = min( @@ -1114,9 +1113,6 @@ def _schedule_prefills( continue num_lookahead_slots: int = 0 - if self.scheduler_config.is_multi_step and enable_chunking: - num_lookahead_slots = self._get_num_lookahead_slots( - True, enable_chunking) # If the sequence group cannot be allocated, stop. can_allocate = self.block_manager.can_allocate( @@ -1195,24 +1191,6 @@ def _schedule_prefills( partial_prefill_metadata.maybe_increment_partial_prefills( seq_group) - if enable_chunking and self.scheduler_config.is_multi_step: - blocks_to_copy: List[Tuple[int, int]] = [] - # init_multi_step_from_lookahead_slots happens in append_slots - self._append_slots(seq_group, blocks_to_copy, enable_chunking) - # This assert will trip when a copy-on-write happens. This is - # not a concern as the very first sequence-group block - # allocation happens above. Still, we have the assert to - # catch any edge-cases. - assert not blocks_to_copy - else: - seq_group.init_multi_step_from_lookahead_slots( - num_lookahead_slots, - num_scheduler_steps=self.scheduler_config. 
- num_scheduler_steps, - is_multi_step=self.scheduler_config.is_multi_step, - enable_chunking=enable_chunking, - ) - seq_groups.append( ScheduledSequenceGroup(seq_group=seq_group, token_chunk_size=num_new_tokens)) @@ -1453,14 +1431,6 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: num_prefill_groups = (len(prefills.seq_groups) + len(swapped_in.prefill_seq_groups) + len(running_scheduled.prefill_seq_groups)) - # If all prompts, then we set num_lookahead_slots to 0 - # this allows us to go through the `no_spec` path in - # `spec_decode_worker.py` - all_prefills = len(scheduled_seq_groups) == num_prefill_groups - num_lookahead_slots = (0 if - (all_prefills - and not self.scheduler_config.is_multi_step) - else running_scheduled.num_lookahead_slots) return SchedulerOutputs( scheduled_seq_groups=scheduled_seq_groups, num_prefill_groups=num_prefill_groups, @@ -1472,7 +1442,7 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: swapped_in.blocks_to_copy, ignored_seq_groups=prefills.ignored_seq_groups + swapped_in.infeasible_seq_groups, - num_lookahead_slots=num_lookahead_slots, + num_lookahead_slots=0, running_queue_size=len(self.running), preempted=(len(running_scheduled.preempted) + len(running_scheduled.swapped_out)), @@ -1516,11 +1486,6 @@ def _can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots = self._get_num_lookahead_slots( is_prefill, enable_chunking) - if is_prefill and num_lookahead_slots > 0: - # Appending prefill slots only happens multi-step and - # chunked-prefill are enabled together. - assert self.scheduler_config.is_multi_step and enable_chunking - return self.block_manager.can_append_slots( seq_group=seq_group, num_lookahead_slots=num_lookahead_slots) @@ -1776,19 +1741,7 @@ def _append_slots( num_lookahead_slots: int = self._get_num_lookahead_slots( is_prefill, enable_chunking) - seq_group.init_multi_step_from_lookahead_slots( - num_lookahead_slots, - num_scheduler_steps=self.scheduler_config.num_scheduler_steps, - is_multi_step=self.scheduler_config.is_multi_step, - enable_chunking=enable_chunking, - ) - seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING - if self.scheduler_config.is_multi_step and enable_chunking: - # In multi-step chunked-prefill any sequence type can have - # slots appended. - seq_status = None - for seq in seq_group.get_seqs(status=seq_status): cows = self.block_manager.append_slots(seq, num_lookahead_slots) if len(cows) > 0: @@ -1904,29 +1857,8 @@ def _get_num_lookahead_slots(self, is_prefill: bool, """The number of slots to allocate per sequence per step, beyond known token ids. Speculative decoding uses these slots to store KV activations of tokens which may or may not be accepted. - - Speculative decoding does not yet support prefill, so we do not perform - lookahead allocation for prefill. - - When chunking is enabled with multi-step, we allocate lookahead slots - for the prefills for when the prefills turn into decodes in the first - step. """ - if is_prefill: - if self.scheduler_config.is_multi_step and enable_chunking: - # num_lookahead_slots was introduced in the context of decodes, - # in Speculative Decoding. - # When the num_scheduler_steps is 8, say, then the - # num_lookahead_slots is 7. Meaning, we are doing a 1-step of - # decode anyways and we wish to do 7 more. - # - # "lookaheads" for prefills, is introduced in support for - # Chunked-Prefill in Multi-Step. 
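Restating the rule being deleted here as one self-contained sketch, using the example values quoted in the comment above (num_scheduler_steps = 8 gives num_lookahead_slots = 7):

# Old behaviour: prefills got lookahead slots only under multi-step with
# chunked prefill enabled; decodes always got num_lookahead_slots.
def old_num_lookahead_slots(is_prefill: bool, enable_chunking: bool,
                            is_multi_step: bool,
                            num_lookahead_slots: int) -> int:
    if is_prefill:
        if is_multi_step and enable_chunking:
            return num_lookahead_slots + 1
        return 0
    return num_lookahead_slots

assert old_num_lookahead_slots(True, True, True, num_lookahead_slots=7) == 8
assert old_num_lookahead_slots(False, False, True, num_lookahead_slots=7) == 7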
- return self.scheduler_config.num_lookahead_slots + 1 - else: - return 0 - - return self.scheduler_config.num_lookahead_slots + return 0 def _get_num_new_uncached_and_cached_tokens( self, @@ -2068,24 +2000,6 @@ def _chunk_new_tokens_to_schedule( The number of new tokens to schedule after chunking. """ remaining_token_budget = budget.remaining_token_budget() - if scheduler_config.is_multi_step: - # The current multi-step + chunked prefill capability does - # not actually support chunking prompts. - # - # Therefore, `num_new_tokens` is computed in the same fashion - # for both multi-step+chunked-prefill & - # multi-step+chunked-prefill+APC - # - # Prompts with more tokens than the current remaining budget - # are postponed to future scheduler steps - if num_new_tokens > prompt_limit: - # If the seq_group is in prompt-stage, pass the - # num_new_tokens as-is so the caller can ignore - # the sequence. - return num_new_tokens - - return 0 if num_new_tokens > \ - remaining_token_budget else num_new_tokens # Get the number of tokens to allocate to this prefill slot prefill_slot_budget = ( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d74db67bda0d..c058001ceb97 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -362,8 +362,6 @@ class EngineArgs: lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size - num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps - multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: Optional[ int] = CacheConfig.num_gpu_blocks_override @@ -799,11 +797,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **scheduler_kwargs["delay_factor"]) scheduler_group.add_argument("--preemption-mode", **scheduler_kwargs["preemption_mode"]) - scheduler_group.add_argument("--num-scheduler-steps", - **scheduler_kwargs["num_scheduler_steps"]) - scheduler_group.add_argument( - "--multi-step-stream-outputs", - **scheduler_kwargs["multi_step_stream_outputs"]) + # multi-step scheduling has been removed; corresponding arguments + # are no longer supported. 
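For callers, the practical effect is simply that the removed flags are dropped; a rough sketch of building engine args after this change, reusing the model name from the tests in this series:

from vllm.engine.arg_utils import AsyncEngineArgs

# The multi-step knobs (num_scheduler_steps, multi_step_stream_outputs) no
# longer exist on EngineArgs; everything else is constructed as before.
engine_args = AsyncEngineArgs(model="JackFram/llama-160m",
                              enable_chunked_prefill=True)
engine_config = engine_args.create_engine_config()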
scheduler_group.add_argument("--scheduling-policy", **scheduler_kwargs["policy"]) scheduler_group.add_argument( @@ -1257,28 +1252,11 @@ def create_engine_config( disable_log_stats=self.disable_log_stats, ) - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - if self.num_scheduler_steps > 1: - if speculative_config is not None: - raise ValueError("Speculative decoding is not supported with " - "multi-step (--num-scheduler-steps > 1)") - if self.enable_chunked_prefill and self.pipeline_parallel_size > 1: - raise ValueError("Multi-Step Chunked-Prefill is not supported " - "for pipeline-parallel-size > 1") - if current_platform.is_cpu(): - logger.warning("Multi-Step (--num-scheduler-steps > 1) is " - "currently not supported for CPUs and has been " - "disabled.") - self.num_scheduler_steps = 1 - - # make sure num_lookahead_slots is set the higher value depending on - # if we are using speculative decoding or multi-step - num_lookahead_slots = max(self.num_lookahead_slots, - self.num_scheduler_steps - 1) - num_lookahead_slots = num_lookahead_slots \ - if speculative_config is None \ - else speculative_config.num_lookahead_slots + # make sure num_lookahead_slots is set appropriately depending on + # whether speculative decoding is enabled + num_lookahead_slots = self.num_lookahead_slots + if speculative_config is not None: + num_lookahead_slots = speculative_config.num_lookahead_slots scheduler_config = SchedulerConfig( runner_type=model_config.runner_type, @@ -1292,8 +1270,6 @@ def create_engine_config( disable_chunked_mm_input=self.disable_chunked_mm_input, is_multimodal_model=model_config.is_multimodal_model, preemption_mode=self.preemption_mode, - num_scheduler_steps=self.num_scheduler_steps, - multi_step_stream_outputs=self.multi_step_stream_outputs, send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER and parallel_config.use_ray), policy=self.scheduling_policy, @@ -1392,11 +1368,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: recommend_to_remove=True) return False - if self.num_scheduler_steps != SchedulerConfig.num_scheduler_steps: - _raise_or_fallback(feature_name="--num-scheduler-steps", - recommend_to_remove=True) - return False - if self.scheduler_delay_factor != SchedulerConfig.delay_factor: _raise_or_fallback(feature_name="--scheduler-delay-factor", recommend_to_remove=True) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1f962b008ee0..b6ee4105340a 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -15,7 +15,7 @@ from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout -from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState +from vllm.engine.llm_engine import LLMEngine from vllm.engine.metrics_types import StatLoggerBase from vllm.engine.protocol import EngineClient from vllm.executor.executor_base import ExecutorBase @@ -308,13 +308,6 @@ async def step_async( if not allow_async_output_proc and len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) - if (self.scheduler_config.is_multi_step - and scheduler_outputs.num_lookahead_slots > 0): - # cache the scheduler outputs for the next iteration if we have - # lookahead slots - self._cache_scheduler_outputs_for_multi_step( - virtual_engine, seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc) else: finished_requests_ids = list() @@ -351,29 +344,14 @@ async def 
step_async( outputs = await self.model_executor.execute_model_async( execute_model_req) - # we need to do this here so that last step's sampled_token_ids can - # be passed to the next iteration for PP. - if self.scheduler_config.is_multi_step: - self._update_cached_scheduler_output(virtual_engine, outputs) else: if len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) outputs = [] - # Finish the current step for all the sequence groups. - if self.scheduler_config.is_multi_step: - for seq_group in seq_group_metadata_list: - seq_group.finish_step() - if not self._has_remaining_steps(seq_group_metadata_list): - # Clear the cache if we have finished all the steps - if self.scheduler_config.is_multi_step: - self.cached_scheduler_outputs[ - virtual_engine] = SchedulerOutputState() - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. When the num_steps > 1, - # multi_step_model_runner does the first-step output append. + # the sequences are 1. is_first_step_output: bool = False if not seq_group_metadata_list \ else seq_group_metadata_list[0].state.num_steps == 1 diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3fc4f6445df2..bbe958351e87 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -25,7 +25,6 @@ from vllm.engine.output_processor.interfaces import ( SequenceGroupOutputProcessor) from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.engine.output_processor.util import create_output_by_sequence_group from vllm.entrypoints.openai.logits_processors import ( get_logits_processors as get_openai_logits_processors) from vllm.executor.executor_base import ExecutorBase @@ -91,7 +90,7 @@ class OutputData(NamedTuple): class SchedulerContext: - def __init__(self, multi_step_stream_outputs: bool = False): + def __init__(self) -> None: self.output_queue: Deque[OutputData] = deque() self.request_outputs: List[Union[RequestOutput, PoolingRequestOutput]] = [] @@ -99,8 +98,6 @@ def __init__(self, multi_step_stream_outputs: bool = False): List[SequenceGroupMetadata]] = None self.scheduler_outputs: Optional[SchedulerOutputs] = None - self.multi_step_stream_outputs: bool = multi_step_stream_outputs - def append_output(self, outputs: List[SamplerOutput], seq_group_metadata_list: List[SequenceGroupMetadata], scheduler_outputs: SchedulerOutputs, is_async: bool, @@ -303,8 +300,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: ] self.scheduler_contexts = [ - SchedulerContext(multi_step_stream_outputs=self.scheduler_config. - multi_step_stream_outputs) + SchedulerContext() for _ in range(self.parallel_config.pipeline_parallel_size) ] @@ -683,8 +679,7 @@ def add_request( "Priority scheduling is not enabled.") if isinstance(params, SamplingParams) \ - and params.logits_processors \ - and self.scheduler_config.num_scheduler_steps > 1: + and params.logits_processors: raise ValueError( "Logits processors are not supported in multi-step decoding") @@ -868,45 +863,6 @@ def _process_sequence_group_outputs( return - def _update_num_computed_tokens_for_multi_step_prefill( - self, seq_group: SequenceGroup, - seq_group_meta: SequenceGroupMetadata, - is_first_step_output: Optional[bool]): - """ - This function updates num_computed_tokens for prompt sequences - when Multi-Step is enabled. - - seq_group: SequenceGroup to update the num_computed_tokens for. - seq_group_meta: Metadata of the given SequenceGroup. 
- is_first_step_output: Optional[bool] - - When available, is_first_step_output indicates if the appended - output token is the output of the first-step in multi-step. - A value of None indicates that outputs from all steps in - in multi-step are submitted in a single burst. - """ - - assert self.scheduler_config.is_multi_step - - if not seq_group_meta.is_prompt: - # num_computed_token updates for multi-step decodes happen after - # the tokens are appended to the sequence. - return - - do_update: bool = False - if self.scheduler_config.chunked_prefill_enabled: - # In multi-step + chunked-prefill case, the prompt sequences - # that are scheduled are fully processed in the first step. - do_update = is_first_step_output is None or is_first_step_output - else: - # Normal multi-step decoding case. In this case prompt-sequences - # are actually single-stepped. Always update in this case. - assert seq_group.state.num_steps == 1 - do_update = True - - if do_update: - seq_group.update_num_computed_tokens( - seq_group_meta.token_chunk_size) - def _process_model_outputs(self, ctx: SchedulerContext, request_id: Optional[str] = None) -> None: @@ -939,33 +895,8 @@ def _process_model_outputs(self, has_multiple_outputs: bool = len(outputs) > 1 outputs_by_sequence_group: List[List[SequenceGroupOutput]] - if has_multiple_outputs: - assert self.scheduler_config.is_multi_step or \ - self.speculative_config - # Organize outputs by [step][sequence group] instead of - # [sequence group][step]. - if self.scheduler_config.is_multi_step: - outputs_by_sequence_group = create_output_by_sequence_group( - outputs, len(seq_group_metadata_list)) - elif self.speculative_config: - # Decodes are multi-steps while prefills are not, outputting at - # most 1 token. Separate them so that we can trigger chunk - # processing without having to pad or copy over prompts K times - # to match decodes structure (costly with prompt_logprobs). - num_prefills = sum(sg.is_prompt - for sg in seq_group_metadata_list) - prefills, decodes = outputs[:num_prefills], outputs[ - num_prefills:] - outputs_by_sequence_group = create_output_by_sequence_group( - decodes, - num_seq_groups=len(seq_group_metadata_list) - num_prefills) - outputs_by_sequence_group = [p.outputs for p in prefills - ] + outputs_by_sequence_group - # We have outputs for multiple steps submitted in a single burst, - # so invalidate is_first_step_output. 
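For context on the branch being removed here: create_output_by_sequence_group regrouped nested output lists so that per-step and per-sequence-group nesting were swapped. Below is a standalone sketch of that kind of regrouping, with plain strings standing in for SamplerOutput objects; it illustrates the reshaping only and is not the vLLM helper itself.

# Toy regrouping: outer list indexed by step, inner list indexed by sequence
# group, transposed so each sequence group sees all of its step outputs.
def regroup_by_sequence_group(outputs, num_seq_groups):
    by_seq_group = [[] for _ in range(num_seq_groups)]
    for step_outputs in outputs:                # one entry per scheduler step
        for i, seq_group_output in enumerate(step_outputs):
            by_seq_group[i].append(seq_group_output)
    return by_seq_group

# Two steps, three sequence groups:
steps = [["s0-g0", "s0-g1", "s0-g2"], ["s1-g0", "s1-g1", "s1-g2"]]
assert regroup_by_sequence_group(steps, 3) == [
    ["s0-g0", "s1-g0"], ["s0-g1", "s1-g1"], ["s0-g2", "s1-g2"]]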
- is_first_step_output = None - else: - outputs_by_sequence_group = outputs + assert not has_multiple_outputs + outputs_by_sequence_group = outputs # Determine the requests we need to operate on if request_id: @@ -1006,13 +937,8 @@ def _process_model_outputs(self, output = [outputs_by_sequence_group[0][i]] if not is_async: - if self.scheduler_config.is_multi_step: - # Updates happen only if the sequence is prefill - self._update_num_computed_tokens_for_multi_step_prefill( - seq_group, seq_group_meta, is_first_step_output) - else: - seq_group.update_num_computed_tokens( - seq_group_meta.token_chunk_size or 0) + seq_group.update_num_computed_tokens( + seq_group_meta.token_chunk_size or 0) if outputs: for o in outputs: @@ -1074,15 +1000,6 @@ def _process_model_outputs(self, for scheduler in self.scheduler: scheduler.free_finished_seq_groups() - # For multi-step without streaming, don't create outputs each iteration - if not is_last_step and not ctx.multi_step_stream_outputs: - # Immediately process request outputs here (if callback is given) - if (finished_now - and self.process_request_outputs_callback is not None): - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - return - # Create the outputs for i in indices: if i in skip or i in finished_before or i in finished_now: @@ -1101,13 +1018,7 @@ def _process_model_outputs(self, if request_output: ctx.request_outputs.append(request_output) - # For multi-step with streaming, create outputs each iteration - if not is_last_step and ctx.multi_step_stream_outputs: - # Immediately process request outputs here (if callback is given) - if self.process_request_outputs_callback is not None: - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - return + # Create outputs only after processing the scheduler's results for seq_group in scheduler_outputs.ignored_seq_groups: params = seq_group.sampling_params @@ -1157,16 +1068,10 @@ def _advance_to_next_step( if seq_group.is_finished(): continue - if self.scheduler_config.is_multi_step: - # Updates happen only if the sequence is prefill - self._update_num_computed_tokens_for_multi_step_prefill( - seq_group, seq_group_metadata, - seq_group.state.num_steps == 1) - else: - token_chunk_size = (seq_group_metadata.token_chunk_size - if seq_group_metadata.token_chunk_size - is not None else 0) - seq_group.update_num_computed_tokens(token_chunk_size) + token_chunk_size = (seq_group_metadata.token_chunk_size + if seq_group_metadata.token_chunk_size + is not None else 0) + seq_group.update_num_computed_tokens(token_chunk_size) if seq_group_metadata.do_sample: assert len(sequence_group_outputs.samples) == 1, ( @@ -1177,16 +1082,8 @@ def _advance_to_next_step( assert len(seq_group.seqs) == 1 seq = seq_group.seqs[0] - if self.scheduler_config.is_multi_step: - is_prefill_append = seq.data.get_num_uncomputed_tokens( - ) == 0 - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) - if not is_prefill_append: - seq_group.update_num_computed_tokens(1) - else: - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) + seq.append_token_id(sample.output_token, sample.logprobs, + sample.output_embed) def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. 
@@ -1289,13 +1186,6 @@ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: if not allow_async_output_proc and len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) - if (self.scheduler_config.is_multi_step - and scheduler_outputs.num_lookahead_slots > 0): - # cache the scheduler outputs for the next iteration if we have - # lookahead slots - self._cache_scheduler_outputs_for_multi_step( - virtual_engine, seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc) else: finished_requests_ids = list() @@ -1345,10 +1235,6 @@ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: # Raise so the caller is notified that this request failed raise - # We need to do this here so that last step's sampled_token_ids can - # be passed to the next iteration for PP. - if self.scheduler_config.is_multi_step: - self._update_cached_scheduler_output(virtual_engine, outputs) else: # Nothing scheduled => If there is pending async postprocessor, # then finish it here. @@ -1357,19 +1243,9 @@ def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: # No outputs in this case outputs = [] - # Finish the current step for all the sequence groups. - if self.scheduler_config.is_multi_step: - for seq_group in seq_group_metadata_list: - seq_group.finish_step() - if not self._has_remaining_steps(seq_group_metadata_list): - # clear the cache if we have finished all the steps. - if self.scheduler_config.is_multi_step: - self.cached_scheduler_outputs[0] = SchedulerOutputState() - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. When the num_steps > 1, - # multi_step_model_runner does the first-step output append. + # the sequences are 1. is_first_step_output: bool = False if not seq_group_metadata_list \ else seq_group_metadata_list[0].state.num_steps == 1 @@ -1453,22 +1329,7 @@ def _abort_and_cache_schedule( def _has_remaining_steps( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] ) -> bool: - if (not self.scheduler_config.is_multi_step - or not seq_group_metadata_list): - return False - - # TODO(will) this is a sanity check for nowto make sure that all the - # seqs are on the same steps. Eventually we will want to do some sort of - # dynamic scheduling when doing multi-step decoding. 
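The sanity check deleted just below asserted that every scheduled sequence group reported the same number of remaining multi-step iterations. A standalone sketch of that invariant, with SeqState as a stand-in rather than a vLLM class:

from dataclasses import dataclass

@dataclass
class SeqState:
    remaining_steps: int

def has_remaining_steps(states):
    # Empty batch: nothing left to run.
    if not states:
        return False
    ref = states[0].remaining_steps
    if any(s.remaining_steps != ref for s in states[1:]):
        raise AssertionError("All running sequence groups should "
                             "have the same remaining steps.")
    return ref > 0

assert has_remaining_steps([SeqState(2), SeqState(2)]) is True
assert has_remaining_steps([SeqState(0), SeqState(0)]) is False
assert has_remaining_steps([]) is False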
- ref_remaining_steps = seq_group_metadata_list[0].state.remaining_steps - if any([ - seq_group.state.remaining_steps != ref_remaining_steps - for seq_group in seq_group_metadata_list[1:] - ]): - raise AssertionError("All running sequence groups should " - "have the same remaining steps.") - - return ref_remaining_steps > 0 + return False def _cache_scheduler_outputs_for_multi_step( self, virtual_engine: int, @@ -1497,13 +1358,6 @@ def _update_cached_scheduler_output( def _get_last_sampled_token_ids( self, virtual_engine: int) -> Optional[torch.Tensor]: - cached_last_output = self.cached_scheduler_outputs[ - virtual_engine].last_output - if (self.scheduler_config.is_multi_step - and self.parallel_config.pipeline_parallel_size > 1 - and cached_last_output is not None - and cached_last_output.sampled_token_ids_cpu is not None): - return cached_last_output.sampled_token_ids_cpu return None def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 19c5963d32db..4d75719c1719 100644 --- a/vllm/engine/output_processor/interfaces.py +++ b/vllm/engine/output_processor/interfaces.py @@ -36,27 +36,13 @@ def create_output_processor( ): """Create an output processor. - This returns a single-step output processor if num_lookahead_slots is - zero, else returns a multi-step output processor. + Multi-step scheduling is no longer supported. Always return a + single-step output processor. """ - if scheduler_config.num_lookahead_slots == 0: - # Importing here to avoid cycle. - from vllm.engine.output_processor.single_step import ( - SingleStepOutputProcessor) - return SingleStepOutputProcessor(scheduler_config, detokenizer, - scheduler, seq_counter, - stop_checker) - else: - # Importing here to avoid cycle. 
- from vllm.engine.output_processor.multi_step import ( - MultiStepOutputProcessor) - return MultiStepOutputProcessor( - detokenizer, - scheduler, - seq_counter, - get_tokenizer_for_seq, - stop_checker, - ) + from vllm.engine.output_processor.single_step import ( + SingleStepOutputProcessor) + return SingleStepOutputProcessor(scheduler_config, detokenizer, + scheduler, seq_counter, stop_checker) @abstractmethod def process_outputs(self, sequence_group: SequenceGroup, diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py deleted file mode 100644 index 8b66ef0dc765..000000000000 --- a/vllm/engine/output_processor/multi_step.py +++ /dev/null @@ -1,211 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import functools -from typing import Callable, List, cast - -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.interfaces import ( - SequenceGroupOutputProcessor) -from vllm.engine.output_processor.single_step import ( - single_step_process_prompt_logprob) -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.logger import init_logger -from vllm.sampling_params import SamplingParams -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import Counter - -logger = init_logger(__name__) - - -class MultiStepOutputProcessor(SequenceGroupOutputProcessor): - """SequenceGroupOutputProcessor which handles logic related to - detokenization and stopping conditions. It specializes to "multi-step - decoding", where vLLM's worker may generate multiple tokens per invocation. - This is currently mutually exclusive with advanced sampling techniques like - beam search, which motivates the separation of this logic from the single - step output processor. - - This class is responsible for things such as correctly appending all new - token ids to their sequence, detokenizing new token ids, truncating new - output tokens after an eos token, and correctly handling the case where the - number of new output tokens per sequence differs in a single batch. - """ - - def __init__( - self, - detokenizer: Detokenizer, - scheduler: List[Scheduler], - seq_counter: Counter, - get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], - stop_checker: StopChecker, - ): - self.detokenizer = detokenizer - self.scheduler = scheduler - self.seq_counter = seq_counter - self.get_tokenizer_for_seq = get_tokenizer_for_seq - self.stop_checker = stop_checker - - def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: - """Process prompt logprobs associated with each step of a multi-step- - scheduled computation. - - Args: - seq_group: the outputs are associated with this - [`SequenceGroup`][vllm.sequence.SequenceGroup] - outputs: the - [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s - for all scheduler steps - """ - for output in outputs: - # Concatenate single-step prompt logprob processing results. 
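The "Importing here to avoid cycle" pattern survives in the simplified create_output_processor above: the processor class is imported only when the factory runs, so interfaces.py never pulls it in at module load time. A tiny runnable illustration of that deferral, using a standard-library module purely as a stand-in for single_step:

def create_decoder(**kwargs):
    # Deferred import: the dependency is resolved on the first call, not at
    # module import time, which is how import cycles between modules are broken.
    import json
    return json.JSONDecoder(**kwargs)

decoder = create_decoder()
print(decoder.decode('{"ok": true}'))  # -> {'ok': True}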
- assert isinstance(output, CompletionSequenceGroupOutput) - single_step_process_prompt_logprob(self, seq_group, output) - - @staticmethod - @functools.lru_cache - def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - logger.warning( - "Prompt logprob is not supported by multi step workers. " - "(e.g., speculative decode uses multi step workers).") - - def process_outputs(self, - sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], - is_async: bool = False) -> None: - """Append new tokens in the outputs to sequences in the sequence group. - - This only supports sequence groups of size 1. It supports greater than - one new token per sequence. - - This applies logic like stop condition checking and detokenization. - It also handles cases where there are tokens emitted after - the EOS token. - - is_async - Indicates whether this postprocessor runs in - parallel with the GPU forward pass and is processing - tokens from the previous step. If this is true, then - no tokens need to be appended since it is already done - externally (before the next schedule() call) - """ - # Sequences can be in RUNNING or FINISHED_ABORTED state - # once scheduled, as a sequence is moved to FINISHED_ABORTED - # if a client disconnects from the api server. - seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) - if seqs is None: - seqs = sequence_group.get_seqs( - status=SequenceStatus.FINISHED_ABORTED) - - assert seqs, "Expected RUNNING or FINISHED_ABORTED sequences" - assert len(seqs) == 1, ( - "Beam search not supported in multi-step decoding.") - seq = seqs[0] - seq_id = seq.seq_id - # This method is defined in the more generic - # SequenceGroupOutputProcessor, but here we assume that the outputs are - # of a more specific type. - assert all([ - isinstance(output, CompletionSequenceGroupOutput) - for output in outputs - ]) - compl_outputs = cast(List[CompletionSequenceGroupOutput], outputs) - assert all([ - seq_id == output.samples[0].parent_seq_id - for output in compl_outputs - ]) - - if is_async: - # Async case: We process tokens one by one. Here, we know the token - # was already appended, so we only need to do the rest of the - # postprocessor: Detokenization + stopping logic - self._process_decode_and_stop(seq, sequence_group.sampling_params) - else: - # Standard multi-step case - - # Since there's only one sequence per sequence group, - # we can take the first sample. - samples = [output.samples[0] for output in compl_outputs] - - # entries in sample tokens may be invalid (eg. due to spec decode - # rejecting tokens). - valid_samples = [ - sample for sample in samples - if sample.output_token != VLLM_INVALID_TOKEN_ID - ] - - # When both spec-decode and pre-fill chunking are enabled, we - # don't have guaranteed samples here (e.g. all -1s). - if valid_samples: - self._process_seq_outputs(seq, valid_samples, - sequence_group.sampling_params) - - def _process_decode_and_stop(self, seq: Sequence, - sampling_params: SamplingParams) -> None: - new_char_count = 0 - if sampling_params.detokenize and self.detokenizer: - new_char_count = self.detokenizer.decode_sequence_inplace( - seq, sampling_params) - - # TODO(sang): Support lora. 
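The _process_seq_outputs body that follows applies two truncations before appending anything: cap the new tokens to the remaining max_tokens budget, then drop everything after the first EOS, since spec decode can emit tokens past it. A standalone sketch of just that arithmetic, with toy integer ids instead of vLLM's sequence types:

def truncate_new_tokens(new_ids, current_output_len, max_tokens, eos_id):
    remaining = max_tokens - (current_output_len + len(new_ids))
    if remaining < 0:
        new_ids = new_ids[:remaining]      # negative index drops the excess
    for i, tok in enumerate(new_ids):
        if tok == eos_id:
            new_ids = new_ids[:i + 1]      # keep the EOS itself, drop the rest
            break
    return new_ids

# 3 tokens already generated, budget of 6, EOS id 2:
assert truncate_new_tokens([5, 7, 2, 9], current_output_len=3,
                           max_tokens=6, eos_id=2) == [5, 7, 2]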
- self.stop_checker.maybe_stop_sequence( - seq, - new_char_count=new_char_count, - sampling_params=sampling_params, - ) - - def _process_seq_outputs(self, seq: Sequence, - valid_samples: List[SequenceOutput], - sampling_params: SamplingParams) -> None: - output_token_ids = [sample.output_token for sample in valid_samples] - output_logprobs = [sample.logprobs for sample in valid_samples] - output_embeds = [sample.output_embed for sample in valid_samples] - - # Truncate to max_tokens if necessary. - remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() + - len(output_token_ids)) - if remaining_tokens < 0: - output_token_ids = output_token_ids[:remaining_tokens] - - # Truncate any tokens after EOS. This is required as spec decode - # generates a fixed number of tokens without evaluating stopping - # conditions within the block. This can cause an eos token to be - # unintentionally ignored. - if not sampling_params.ignore_eos and self.detokenizer: - eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id - # Avoiding .index calls as exception throwing in the happy path - # is expensive. - for i in range(len(output_token_ids)): - if output_token_ids[i] == eos_token_id: - output_token_ids = output_token_ids[:i + 1] - break - - is_prefill_sampled_token = seq.data.get_num_uncomputed_tokens() == 0 - # Incrementally append tokens to the sequence, as if we had only one new - # token. - for output_token_id, output_logprob, output_embed in zip( - output_token_ids, output_logprobs, output_embeds): - seq.append_token_id( - token_id=output_token_id, - logprobs=output_logprob, - token_embed=output_embed, - ) - - if is_prefill_sampled_token: - is_prefill_sampled_token = False - else: - # Update num_computed_tokens iff the sampled token is not from - # a prefill step. - seq.data.update_num_computed_tokens(1) - - self._process_decode_and_stop(seq, sampling_params) - - if seq.is_finished(): - break diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index c876c52a2e9c..70959131573f 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -118,20 +118,10 @@ def log_warnings(cls): @classmethod def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: parallel_config = vllm_config.parallel_config - scheduler_config = vllm_config.scheduler_config model_config = vllm_config.model_config if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - if envs.VLLM_USE_V1: - raise NotImplementedError( - "Multi-step scheduling is not supported (and not " - "needed) on vLLM V1. 
Please launch without " - "--num-scheduler-steps.") - else: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_worker.MultiStepWorker" - elif vllm_config.speculative_config: + if vllm_config.speculative_config: if not envs.VLLM_USE_V1: raise NotImplementedError( "Speculative decoding is not supported on vLLM V0.") @@ -139,7 +129,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: else: if envs.VLLM_USE_V1: parallel_config.worker_cls = \ - "vllm.v1.worker.gpu_worker.Worker" + "vllm.v1.worker.gpu_worker.Worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 8005830f55ce..2d5bee5fc505 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -327,18 +327,8 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: cache_config.block_size = 16 parallel_config = vllm_config.parallel_config - scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - if envs.VLLM_USE_V1: - raise NotImplementedError( - "Multi-step scheduling is not supported (and not " - "needed) on vLLM V1. Please launch without " - "--num-scheduler-steps.") - else: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_worker.MultiStepWorker" - elif vllm_config.speculative_config: + if vllm_config.speculative_config: if not envs.VLLM_USE_V1: raise NotImplementedError( "Speculative decoding is not supported on vLLM V0.") @@ -346,7 +336,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: else: if envs.VLLM_USE_V1: parallel_config.worker_cls = \ - "vllm.v1.worker.gpu_worker.Worker" + "vllm.v1.worker.gpu_worker.Worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index c56096d93612..c7522a89c257 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -133,18 +133,13 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - raise NotImplementedError( - "Multi-step scheduling is not supported (and not " - "needed) on vLLM V1. Please launch without " - "--num-scheduler-steps.") parallel_config.worker_cls = "vllm.v1.worker.tpu_worker.TPUWorker" assert not vllm_config.speculative_config, ( "Speculative decoding is not yet supported for TPU backend") if scheduler_config.is_multimodal_model and not \ - scheduler_config.disable_chunked_mm_input: + scheduler_config.disable_chunked_mm_input: logger.warning("TPU does not support running Multimodal models"\ " without setting `--disable_chunked_mm_input`. 
" \ "Forcing --disable_chunked_mm_input.") diff --git a/vllm/sequence.py b/vllm/sequence.py index 6e65a2bd0318..cbe63f8d1d4e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -794,35 +794,6 @@ def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - def init_multi_step(self, num_steps: int) -> None: - self.state.num_steps = num_steps - self.state.current_step = 0 - - def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int, - num_scheduler_steps: int, - is_multi_step: bool, - enable_chunking: bool) -> None: - - if not is_multi_step: - self.init_multi_step(num_steps=num_scheduler_steps) - return - - # Multi-Step case - is_prefill = self.is_prefill() - - # The asserts below reflect the expectations of the current system. - if is_prefill and enable_chunking: - assert num_lookahead_slots == num_scheduler_steps - self.init_multi_step(num_steps=num_lookahead_slots) - else: - is_decode: bool = not is_prefill - # If it is a prefill, num_lookahead_slots must be 0 - assert num_lookahead_slots == 0 or is_decode - # If it is a decode, num_lookahead_slots + 1 must match - # the scheduler steps. - assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill - self.init_multi_step(num_steps=num_lookahead_slots + 1) - def set_last_token_time(self, now: float) -> None: """Sets the last token time for Request level timings.""" # If still in prefill phase, assertion fails. @@ -1367,15 +1338,6 @@ class ExecuteModelRequest( # Async callback async_callback: Optional[Callable] = None - @property - def is_first_multi_step(self) -> bool: - # TODO(will) make this be able to handle batches with variable number of - # steps - assert len(self.seq_group_metadata_list) > 0 - first_seq_group = self.seq_group_metadata_list[0] - assert first_seq_group.state is not None - return first_seq_group.state.current_step == 0 - @property def is_last_step(self) -> bool: # TODO(will) make this be able to handle batches with variable number of diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 20b9b733cd3b..a63797e3a46a 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -508,8 +508,7 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, if inter_data.is_prompt: context_len = seq_data.get_num_computed_tokens() seq_len = min(seq_len, context_len + token_chunk_size) - elif self.runner.scheduler_config.is_multi_step or \ - self.runner.model_config.is_encoder_decoder: + elif self.runner.model_config.is_encoder_decoder: context_len = seq_len - 1 else: context_len = seq_data.get_num_computed_tokens() @@ -778,9 +777,7 @@ def _get_cuda_graph_pad_size(self, int: Returns the determined number of padding sequences. If CUDA graphs is not viable, returns -1. """ - is_mscp: bool = self.runner.scheduler_config.is_multi_step and \ - self.runner.scheduler_config.chunked_prefill_enabled - decode_only = self.decode_only or is_mscp + decode_only = self.decode_only if not decode_only: # Early exit so we can treat num_seqs as the batch_size below. 
return -1 diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py deleted file mode 100644 index 2aa910bdff6b..000000000000 --- a/vllm/worker/multi_step_model_runner.py +++ /dev/null @@ -1,908 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import functools -from dataclasses import dataclass, field -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Union) - -import torch - -from vllm.distributed import get_pp_group -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import (PromptLogprobs, SampleLogprobs, - SamplerOutput, - SamplingMetadata, get_logprobs, - get_pythonized_sample_results) -from vllm.platforms import current_platform -from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - Logprob, SequenceGroupMetadata, SequenceOutput) -from vllm.utils import PyObjectCache, async_tensor_h2d, current_stream -from vllm.worker.model_runner import (GPUModelRunnerBase, - ModelInputForGPUWithSamplingMetadata) -from vllm.worker.model_runner_base import ( - BroadcastableModelInput, _init_attn_metadata_from_tensor_dict, - _init_frozen_model_input_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -from ..model_executor.model_loader.tensorizer import TensorizerConfig - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -MULTI_STEP_ATTENTION_BACKENDS = [ - "FLASH_ATTN", "ROCM_FLASH", "FLASHINFER", "NO_ATTENTION" -] -MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN", "FLASHINFER"] - -def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ - -> List[str]: - if chunked_prefill_enabled: - return MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS - else: - return MULTI_STEP_ATTENTION_BACKENDS - - -def seq_output_builder(): - return SequenceOutput( - 0, 0, - {0: Logprob(logprob=float('inf'), rank=None, decoded_token=None)}) - - -def completion_seq_group_output_builder(): - return CompletionSequenceGroupOutput([], None) - - -# Used by pythonization to reduce python object allocations -class PythonizationCache: - - def __init__(self): - self.cached_seq_output = PyObjectCache(seq_output_builder) - self.cached_completion_seq_group_output = PyObjectCache( - completion_seq_group_output_builder) - - def reset(self): - self.cached_seq_output.reset() - self.cached_completion_seq_group_output.reset() - - -@dataclass -class ModelOutput: - """The output of a single model forward pass. - - The sampler_output_ready_event is set when the tensors in - sampler_output are ready (the model+sampler forward pass has - completed). We use the event to synchronize the GPU->CPU transfer, - which we want to only run when the data has been written to the - GPU tensors. Until the event is ready, the tensors in sampler_output - will have garbage data. - - There are two scenarios: - 1. The output tensors are ready and we can pythonize them immediately. - 2. The output tensors are not ready and we need to wait for the event to be - ready. - """ - sampler_output: SamplerOutput - sampler_output_ready_event: torch.cuda.Event - sampled_token_ids: Optional[torch.Tensor] = None - pythonized: bool = False - # On-device tensor containing the logprobs of each token. 
- logprobs: Optional["torch.Tensor"] = None - pythonization_cache: Optional[PythonizationCache] = None - - def pythonize(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor) -> None: - """Pythonize the output. Blocking.""" - if not self.pythonized: - self._pythonize_sampler_output(input_metadata, copy_stream, - pinned_sampled_token_buffer, True) - self.pythonized = True - - def maybe_pythonize(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor) -> None: - """Pythonize the output if ready, else return None. Non-blocking.""" - if not self.pythonized: - self.pythonized = self._pythonize_sampler_output( - input_metadata, copy_stream, pinned_sampled_token_buffer, - False) - - def _pythonize_sampler_output(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor, - blocking: bool) -> bool: - """ - If blocking is set, will block until the forward pass for the output is - ready and pythonize the output. Upon completing Pythonization, erases - self.logprobs (note that a non-blocking call that is performed when - the sampler output is not yet ready, will not erase self.logprobs.) - """ - assert self.sampled_token_ids is not None - if not blocking and not self.sampler_output_ready_event.query(): - return False - - if blocking: - self.sampler_output_ready_event.synchronize() - with torch.cuda.stream(copy_stream): - _pythonize_sampler_output(input_metadata, self.sampler_output, - pinned_sampled_token_buffer, - self.sampled_token_ids, self.logprobs, - self.pythonization_cache) - - # Erase the logprobs GPU-side tensor. - # Note that although _pythonize_sampler_output() runs in its - # own CUDA stream, nonetheless _pythonize_sampler_output() - # cannot return until Pythonization is complete; therefore - # we know that by the time the CPU reaches this point, - # `self.logprobs` is no longer needed. - self.logprobs = None - return True - - -@dataclass(frozen=False) -class StatefulModelInput(BroadcastableModelInput): - # actual frozen model input dataclass passed to _base_model_runner - frozen_model_input: Optional[ModelInputForGPUWithSamplingMetadata] = None - - # list of model outputs for each step, may not be all pythonized - cached_outputs: List[ModelOutput] = field(default_factory=list) - - # used to pass sampled token ids from the last step to the current step for - # TP workers. 
Used to append to end of outputs and used by advance_step - last_sampled_token_ids: Optional[torch.Tensor] = None - current_step: int = 0 - is_multi_step: bool = True - is_last_step: bool = False - is_first_multi_step: bool = False - base_output_proc_callback: Optional[Callable] = None - # ping-pong data structures for multi-step to wait on the previous step - step_cuda_events: List[current_platform.Event] = field( - default_factory=lambda: [current_platform.Event(blocking=True)] * 2) - num_seqs: int = -1 - num_queries: int = -1 - num_single_step_prefills: int = 0 - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - assert self.frozen_model_input is not None - tensor_dict = self.frozen_model_input.as_broadcastable_tensor_dict() - new_tensor_dict = { - 'last_sampled_token_ids': self.last_sampled_token_ids, - 'current_step': self.current_step, - 'is_multi_step': self.is_multi_step, - 'is_last_step': self.is_last_step, - 'is_first_multi_step': self.is_first_multi_step, - 'num_seqs': self.num_seqs, - 'num_queries': self.num_queries, - 'num_single_step_prefills': self.num_single_step_prefills, - } - tensor_dict.update(new_tensor_dict) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "StatefulModelInput": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - tensor_dict = _init_frozen_model_input_from_tensor_dict( - ModelInputForGPUWithSamplingMetadata, tensor_dict) - - return cls(**tensor_dict) - - def record_step_event(self, current_stream: torch.cuda.Stream): - # record the event for the current step so that the next step can sync - # on it. We modulo by 2 to keep the events in a circular buffer and - # support any attn backends that may be supported in the future. ie - # Flashinfer would want two DecodeWrappers to overlap the CPU and GPU. - self.step_cuda_events[self.current_step & 1] = \ - torch.cuda.Event(blocking=True) - self.step_cuda_events[self.current_step & 1].record(current_stream) - - def wait_previous_step(self): - # These cuda events are an explicit synchronization to ensure that - # advance_step() (for other attn backends that may be supported in the - # future) do not clobber any data structures that is also used by any - # enqueued forwards steps. For distributed case, only a single event is - # needed, but for single GPU case, since we can let the CPU run much - # further ahead, two events allow us to overlap the advance_step with - # the previous forward (ie using two DecodeWrappers for flashinfer - # backend) - self.step_cuda_events[(self.current_step + 1) & 1].wait() - - def add_sampler_output(self, - sampler_output: SamplerOutput, - sampled_token_ids: Optional[torch.Tensor] = None): - self.cached_outputs.append( - ModelOutput(sampler_output=sampler_output, - sampler_output_ready_event=None, - sampled_token_ids=sampled_token_ids, - pythonized=False)) - - def maybe_advance_sampling_metadata(self, device: str, pin_memory: bool): - """ - sampling_metadata.selected_token_indices is constructed for the - first-step in Multi-Step. However, when chunked-prefill is enabled with - multi-step, the scheduled prompts are fully processed in the - first-step and are processed as decodes in the rest of the steps. - This function updates the sampling_metadata.selected_token_indices - to account for this conversion. 
- - Example: - Let 2 prompts and 2 decodes be scheduled together. Let the - num-tokens to process for the 2 prompts be 5 and 8 respectively. - - In that case, sampling_metadata.sampled_token_indices will be, - [4, 12, 13, 14] as it is constructed for the first-step in - multi-step. - However, the prompts turns to decodes after the first-step - and the num-tokens for the previously-prompt sequences will - be 1 and 1 as they are decodes now. The self.sampled_token_indices - must be updated to [0,1,2,3]. - """ - assert self.current_step == 1 and self.num_single_step_prefills > 0 - if not get_pp_group().is_last_rank: - return - - assert self.frozen_model_input is not None - assert self.frozen_model_input.sampling_metadata is not None - self.frozen_model_input.sampling_metadata.selected_token_indices = \ - async_tensor_h2d(list(range(self.num_queries)), - dtype=torch.long, - target_device=device, - pin_memory=pin_memory) - - def maybe_advance_frozen_model_input(self, device: str, pin_memory: bool): - """ - Advancing the datastructures of StatefulModelInput::frozen_model_input - is only required when prefills are scheduled with decodes to run in - multi-step. This advancement/correction is required to account for - the conversion of Prefills to Decodes after the first multi-step. - """ - if self.current_step != 1 or self.num_single_step_prefills == 0: - return - - assert self.frozen_model_input is not None - fmi = self.frozen_model_input - - # Truncate input_tokens - assert fmi.input_tokens is not None - assert fmi.input_tokens.shape[0] >= self.num_seqs - fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs] - - # Update frozen_model_input::input_positions. - assert fmi.input_positions is not None - assert fmi.input_positions.shape[0] >= self.num_seqs - fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self. - num_seqs] - - # Assert unsupported - assert fmi.lora_mapping is None - assert fmi.lora_requests is not None - assert len(fmi.lora_requests) == 0 - assert fmi.attn_metadata is not None - assert fmi.multi_modal_kwargs is not None - assert len(fmi.multi_modal_kwargs) == 0 - - self.frozen_model_input = dataclasses.replace( - self.frozen_model_input, - input_tokens=fmi_new_input_tokens, - input_positions=fmi_new_input_positions) - - self.maybe_advance_sampling_metadata(device, pin_memory) - - -# MutableModelInputForGPUWithMultiStepMetadata is not subclass of -# ModelInputForGPU but it wraps the actual input dataclass and adds multi-step -# metadata -# mypy: disable-error-code=type-var -class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]): - # mypy: enable-error-code=type-var - - def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs): - - super().__init__(*args, **kwargs) - - # Check attention backend support. - supported_attention_backends: List[str] = \ - _get_supported_attention_backends( - self.scheduler_config.chunked_prefill_enabled) - if self.attn_backend.get_name() not in supported_attention_backends: - ms_config_str: str = "Multi-Step + Chunked-Prefill" \ - if self.scheduler_config.chunked_prefill_enabled \ - else "Multi-Step" - raise ValueError( - f"{ms_config_str} not supported for attention backend: " - f"{self.attn_backend.get_name()}. 
Set VLLM_ATTENTION_BACKEND " - f"to a value from {supported_attention_backends}.") - - # uses the base model runner to execute the model and wraps it with - # multi-step logic - self._base_model_runner: GPUModelRunnerBase = base_model_runner - - self.is_multi_step = self.scheduler_config.is_multi_step - self.pinned_sampled_token_ids: Optional[torch.Tensor] = None - - # Using the PythonizationCache in Pipeline-Parallel clobbers the - # SequenceOutput and CompletionSequenceGroupOutput object. - # When cache-reset happens at the last step of a multi-step - # execution, there may be other on-going single-step/multi-step - # executions. The current caching implementation does not check - # for this. - self.pythonization_cache = PythonizationCache() \ - if self.parallel_config.pipeline_parallel_size == 1 else None - - @functools.cached_property - def _copy_stream(self): - # used to copy tensors from GPU to CPU asynchronously - return torch.cuda.Stream() - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> StatefulModelInput: - model_input = (StatefulModelInput.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - )) - return model_input - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> StatefulModelInput: - frozen_model_input: ModelInputForGPUWithSamplingMetadata = \ - self._base_model_runner.prepare_model_input( - seq_group_metadata_list, - virtual_engine, - finished_requests_ids) - - assert frozen_model_input.query_lens is not None - assert frozen_model_input.seq_lens is not None - assert frozen_model_input.attn_metadata is not None - num_queries = len(frozen_model_input.query_lens) - num_seqs = len(frozen_model_input.seq_lens) - num_single_step_prefills = frozen_model_input.attn_metadata.num_prefills - - model_input = StatefulModelInput( - frozen_model_input=frozen_model_input, - num_seqs=num_seqs, - num_queries=num_queries, - num_single_step_prefills=num_single_step_prefills) - - return model_input - - def _async_process_outputs(self, model_input: StatefulModelInput, - output_proc_callback: Callable): - # Proceed with pythonization and output_proc in order. 
- # Stop on the first one that fails to pythonize - output_proc_callback() - - cont = True - for step_num, model_output in enumerate(model_input.cached_outputs): - if not model_output.pythonized: - model_output.maybe_pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - if model_output.pythonized: - ctx = output_proc_callback.keywords["ctx"] - ctx.append_output( - outputs=[model_output.sampler_output], - seq_group_metadata_list=ctx.seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - - output_proc_callback() - else: - cont = False - - if not cont: - break - - def _final_process_outputs( - self, model_input: StatefulModelInput, - output_proc_callback: Optional[Callable]) -> List[SamplerOutput]: - assert model_input.frozen_model_input is not None - - has_async_callback = output_proc_callback is not None - - outputs = [] - for step_num, output in enumerate(model_input.cached_outputs): - is_last_step = step_num == len(model_input.cached_outputs) - 1 - - # For non-async case: - # -- We simply add the outputs - # For async case: - # -- Invoke callback, pythonize, add to callback queue and repeat - # -- For last output, just add to callback queue - if has_async_callback: - assert output_proc_callback is not None - - # Invoke callback before pythonize (to overlap with GPU) - output_proc_callback() - - # Pythonize - if not output.pythonized: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - - # For non last step, add to callback queue to chain - # callbacks=>pythonize pairs (for GPU overlap) - if not is_last_step: - ctx = output_proc_callback.keywords[ # type: ignore - "ctx"] # type: ignore - ctx.append_output( - outputs=[output.sampler_output], - seq_group_metadata_list=ctx. - seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - else: - outputs.append(output.sampler_output) - else: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - outputs.append(output.sampler_output) - - return outputs - - @torch.inference_mode() - def execute_model( - self, - model_input: StatefulModelInput, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - """ - Execute the model for a single step and update multi-step - metadata - """ - assert num_steps == 1, "MultiStepModelRunner only supports num_steps=1" - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # path for warm up runs - if not model_input.is_multi_step: - return self._base_model_runner.execute_model( - frozen_model_input, None, intermediate_tensors, num_steps) - - # make sure we skip the sampler on the lask rank and only pythonize - # if CPU is ahead. 
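"Only pythonize if CPU is ahead" refers to the event gate defined on ModelOutput earlier in this deleted file: GPU results are converted to Python objects without blocking while the forward pass is still running, and the conversion only blocks on the final step. A minimal sketch of that gate; it needs torch and a CUDA device to actually run:

import torch

def copy_out_when_ready(gpu_tensor, done_event, blocking):
    # Non-blocking path: bail out if the recorded event has not fired yet.
    if not blocking and not done_event.query():
        return None
    # Blocking path (last step): wait for the GPU to finish.
    if blocking:
        done_event.synchronize()
    return gpu_tensor.cpu().tolist()

x = torch.randint(0, 10, (4,), device="cuda")
ev = torch.cuda.Event()
ev.record()  # records on the current CUDA stream
print(copy_out_when_ready(x, ev, blocking=True))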
- if self.is_driver_worker and get_pp_group().is_last_rank: - if self.pinned_sampled_token_ids is None: - self.pinned_sampled_token_ids = torch.zeros( - (self.scheduler_config.max_num_seqs, 1), - dtype=torch.long, - device="cpu", - pin_memory=True) - - self._base_model_runner.sampler.include_gpu_probs_tensor = True - if frozen_model_input.sampling_metadata: - frozen_model_input.sampling_metadata.skip_sampler_cpu_output = ( - True) - - # some pre-execute model logic for multi-step: - # - if it's the first step, we need to reset the sampling tensors - # - if it's not the first step, we need to advance the step using the - # appended sampler output from last iteration - # - also maybe pythonize if CPU is ahead of GPU - - stream = current_stream() - if not model_input.is_first_multi_step: - # Explicitly block on the previous step's forward to make sure we - # don't clobber any GPU tensors still in use. - # This is not needed for flashattn backend, but for other attn - # backends such as flashinfer that performs extra CPU operations on - # input metadata we may need to synchronize any CPU operations that - # might clobber enqueued forwards. (prevents CPU from running too - # far ahead if needed) - model_input.wait_previous_step() - model_input = self._advance_step( - model_input, model_input.cached_outputs[-1].sampler_output) - - # frozen_model_input may have been updated - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - if model_input.base_output_proc_callback is None: - assert frozen_model_input is not None - model_input.base_output_proc_callback = \ - frozen_model_input.async_callback - - if frozen_model_input.async_callback is not None: - assert model_input.base_output_proc_callback is not None - async_callback = functools.partial( - self._async_process_outputs, - model_input=model_input, - output_proc_callback=model_input.base_output_proc_callback) - - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=async_callback) - # Update the local instance - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # Execute the model - output = self._base_model_runner.execute_model(frozen_model_input, - None, - intermediate_tensors, - num_steps=1) - - # record the event for the current step so that the next step can sync - model_input.record_step_event(stream) - - if get_pp_group().is_last_rank and self.is_driver_worker: - assert isinstance(output, list) - assert len( - output - ) == 1, "MultiStepModelRunner requires single-step base_models" - - # event for the pythonization so that we only pythonize if the - # tensors are ready. May be able to be combined with the step event - output_ready_event = torch.cuda.Event() - output_ready_event.record(stream) - if self.parallel_config.pipeline_parallel_size > 1: - output[0].sampled_token_ids_cpu = output[ - 0].sampled_token_ids.cpu() - model_input.cached_outputs.append( - ModelOutput(output[0], output_ready_event, - output[0].sampled_token_ids, False, - output[0].logprobs, self.pythonization_cache)) - - # These GPU tensors are not required by multi-step; - # erase them to ensure they are not pythonized or - # transferred to CPU - output[0].sampled_token_ids = None - output[0].sampled_token_probs = None - output[0].logprobs = None - - # Pythonize the output if CPU is ahead and the previous step is - # ready. 
- if frozen_model_input.async_callback is None: - for model_output in model_input.cached_outputs: - model_output.maybe_pythonize(model_input, - self._copy_stream, - self.pinned_sampled_token_ids) - - model_input.current_step += 1 - - if not get_pp_group().is_last_rank: - # Should be IntermediateTensors - assert isinstance(output, IntermediateTensors) - return output - if not self.is_driver_worker: - return [] - - # Pythonize the output and block if needed since it is the last step - if model_input.is_last_step: - outputs = self._final_process_outputs( - model_input, model_input.base_output_proc_callback) - if self.pythonization_cache: - self.pythonization_cache.reset() - return outputs - - # should be [SamplerOutput] - return output - - def _update_sampling_metadata(self, sampling_metadata: SamplingMetadata, - num_seqs: Optional[int], num_queries: int): - - assert sampling_metadata.num_prompts == 0 - assert len(sampling_metadata.seq_groups) == num_queries - assert sampling_metadata.selected_token_indices.shape == ( - num_queries, ) - # assert sampling_metadata.categorized_sample_indices == TODO: Add if needed # noqa: E501 - - # Verify that all sequences are decodes - for i in range(num_queries): - seq_group = sampling_metadata.seq_groups[i] - - assert seq_group.is_prompt is False # No prompt - assert seq_group.prompt_logprob_indices == [] # No prompt - assert seq_group.sample_indices == [i] # Simple - assert seq_group.seq_len is None # Decode - assert seq_group.query_len is None # Decode - - def _advance_step(self, model_input: StatefulModelInput, - out: SamplerOutput) -> StatefulModelInput: - - model_input.maybe_advance_frozen_model_input(self.device, - self.pin_memory) - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.input_tokens is not None - assert frozen_model_input.input_tokens.shape[0] == model_input.num_seqs - assert frozen_model_input.attn_metadata is not None - - sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids - num_seqs = model_input.num_seqs - num_queries = model_input.num_queries - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - attn_metadata = frozen_model_input.attn_metadata - assert attn_metadata is not None - - turn_prefills_into_decodes: bool = model_input.current_step == 1 and \ - model_input.num_single_step_prefills != 0 - attn_metadata.advance_step( - frozen_model_input, - sampled_token_ids, - self.block_size, - num_seqs, - num_queries, - turn_prefills_into_decodes=turn_prefills_into_decodes) - - return model_input - - def load_model(self) -> None: - self._base_model_runner.load_model() - self.model_memory_usage = self._base_model_runner.model_memory_usage - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - return self._base_model_runner.save_sharded_state( - path, pattern, max_size) - - def save_tensorized_model(self, - tensorizer_config: TensorizerConfig) -> None: - return self._base_model_runner.save_tensorized_model(tensorizer_config) - - def profile_run(self) -> None: - return self._base_model_runner.profile_run() - - def remove_all_loras(self): - return self._base_model_runner.remove_all_loras() - - def capture_model(self, kv_caches: List[List]) -> None: - return self._base_model_runner.capture_model(kv_caches) - - @property - def vocab_size(self) -> int: - return self._base_model_runner.vocab_size - - -DeferredLogprobsReturnType = 
Tuple[Optional[List[Optional[PromptLogprobs]]], - Optional[List[SampleLogprobs]]] - - -def deferred_pythonize_logprobs( - output: SamplerOutput, - sampling_metadata: SamplingMetadata, - logprobs_tensor: Optional[torch.Tensor], -) -> DeferredLogprobsReturnType: - """Perform deferred logprob Pythonization. - - 1. Pythonize GPU-side sampler result tensors into CPU-side sampler result. - 2. Pythonize GPU-side logprobs tensor into CPU-side logprobs lists, - utilizing the Pythonized sampler result computed in step 1. - - These deferred computations are not required for single-step scheduling - or the `profile_run()` phase of multi-step scheduling. - - Args: - output: sampler output (under deferred Pythonization) - sampling_metadata - - Returns: - prompt_logprobs (CPU), sample_logprobs (CPU) - """ - - # - Deferred pythonization of sample result - sampler_result = get_pythonized_sample_results( - output.deferred_sample_results_args) - - # - Erase the GPU-side deferred sample_result - # computation args to ensure it is never - # pythonized or transferred to CPU - output.deferred_sample_results_args = None - - # - Deferred pythonization of logprobs - ( - prompt_logprobs, - sample_logprobs, - ) = get_logprobs(logprobs_tensor, sampling_metadata, sampler_result) - assert len(prompt_logprobs) == len(sampling_metadata.seq_groups) - assert len(sample_logprobs) == len(sampling_metadata.seq_groups) - - return prompt_logprobs, sample_logprobs - - -def _pythonize_sampler_output( - model_input: StatefulModelInput, - output: SamplerOutput, - pinned_sampled_token_buffer: torch.Tensor, - sampled_token_ids: torch.Tensor, - logprobs_tensor: Optional[torch.Tensor], - cache: Optional[PythonizationCache], -) -> None: - """ This function is only called when the output tensors are ready. - See [`ModelOutput`][vllm.worker.multi_step_model_runner.ModelOutput]. - - Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, - adding a Pythonized output data structure - ([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput]) - for each [`SequenceGroup`][vllm.sequence.SequenceGroup]. - - Args: - model_input - output: sampler output - pinned_sampled_token_token_buffer: CPU-side pinned memory - (receives copy of - GPU-side token buffer.) - sampled_token_ids: GPU-side token buffer - logprobs_tensor: GPU-side tensor containing - logprobs computed during sampling - """ - - assert model_input.frozen_model_input is not None - - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input.sampling_metadata is not None - sampling_metadata = frozen_model_input.sampling_metadata - # samples generation should have been skipped - assert not output.outputs - - pinned_buffer = pinned_sampled_token_buffer[:model_input.num_queries] - - # We guarantee output tensors are ready, so it is safe to - # pythonize the sampler output & obtain CPU-side logprobs. - # - # However we should check whether logprobs pythonization may - # be skipped entirely, i.e. because no logprobs were requested - # or pythonization was not deferred. To that end, - # - # * `prompt_logprobs_are_requested_for_prefill` signals that - # there are *any* prefill-phase requests which specify that - # prompt logprobs should be returned. - # - # * `any_logprobs_are_requested` signals that there are any - # requests which (1) specify that sample logprobs should be - # returned, or (2) are in the prefill phase AND specify that - # prompt logprobs should be returned. 
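A standalone sketch of the two flags the comment block here describes; Group and Params are tiny stand-ins for vLLM's sequence-group and sampling-params objects:

from collections import namedtuple

Params = namedtuple("Params", ["prompt_logprobs", "logprobs"])
Group = namedtuple("Group", ["is_prompt", "sampling_params"])

def logprob_flags(seq_groups):
    # Any prefill-phase request asking for prompt logprobs?
    prompt_logprobs_for_prefill = any(
        sg.sampling_params.prompt_logprobs is not None and sg.is_prompt
        for sg in seq_groups)
    # Any request asking for logprobs of any kind?
    any_logprobs = prompt_logprobs_for_prefill or any(
        sg.sampling_params.logprobs is not None for sg in seq_groups)
    return prompt_logprobs_for_prefill, any_logprobs

groups = [Group(True, Params(None, None)), Group(False, Params(None, 5))]
assert logprob_flags(groups) == (False, True)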
- # - # Later on, these flags cause adjustments to the pythonization - # process to accommodate logprobs. - - seq_groups = sampling_metadata.seq_groups - prompt_logprobs_are_requested_for_prefill = any([ - sg.sampling_params.prompt_logprobs is not None and sg.is_prompt - for sg in seq_groups - ]) - any_logprobs_are_requested = ( - prompt_logprobs_are_requested_for_prefill - or any([sg.sampling_params.logprobs is not None for sg in seq_groups])) - - if prompt_logprobs_are_requested_for_prefill: - # CPU GPU sync, after gathering *only* sampled tokens (since - # requesting prompt logprobs leads `sampled_token_ids` to - # include prompt token ids in addition to sampled token ids.) - sample_idx_tensor = torch.tensor( - [sdx for sg in seq_groups for sdx in sg.sample_indices]) - pinned_buffer = pinned_buffer.copy_( - sampled_token_ids[sample_idx_tensor, :], non_blocking=False) - else: - # CPU GPU sync - pinned_buffer = pinned_buffer.copy_(sampled_token_ids, - non_blocking=False) - - # this will not block as the tensors are already on CPU - samples_list = pinned_buffer.tolist() - - skip_sampler_cpu_output = ( - frozen_model_input.sampling_metadata.skip_sampler_cpu_output) - - # *Don't* skip logprobs pythonization *if*: - # * Any requests require logprobs to be returned in this - # iteration AND - # * These requests are being scheduled in a fashion which - # defers pythonization (i.e. multi-step scheduling.) - do_pythonize_logprobs = (skip_sampler_cpu_output - and any_logprobs_are_requested) - ( - prompt_logprobs, - sample_logprobs, - ) = (deferred_pythonize_logprobs(output, sampling_metadata, - logprobs_tensor) - if do_pythonize_logprobs else (None, None)) - - for sgdx, (seq_group, - sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - # (Check for Guided Decoding) - if seq_group.sampling_params.logits_processors: - assert len(seq_group.sampling_params.logits_processors) == 0, ( - "Logits Processors are not supported in multi-step decoding") - - if do_pythonize_logprobs: - assert prompt_logprobs is not None - assert sample_logprobs is not None - - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( # Utilize deferred pythonization results - prompt_logprobs[sgdx], - sample_logprobs[sgdx], - ) - elif any_logprobs_are_requested: - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( - # profile_run: use already-computed logprobs - output.outputs[sgdx].prompt_logprobs, - [sample.logprobs for sample in output.outputs[sgdx].samples]) - - seq_ids = seq_group.seq_ids - next_token_ids = sample_result - parent_ids = [0] - seq_outputs: List[SequenceOutput] - - if cache is not None: - completion_seq_group_output: CompletionSequenceGroupOutput = \ - cache.cached_completion_seq_group_output.get_object() - completion_seq_group_output.samples.clear() - seq_outputs = completion_seq_group_output.samples - else: - seq_outputs = [] - - for tdx, (parent_id, - next_token_id) in enumerate(zip(parent_ids, next_token_ids)): - if cache is not None: - seq_output: SequenceOutput = cache.cached_seq_output.get_object( - ) - seq_output.parent_seq_id = seq_ids[parent_id] - seq_output.output_token = next_token_id - - if any_logprobs_are_requested: - seq_output.logprobs = group_sample_logprobs[tdx] - else: - logprobs = next(iter(seq_output.logprobs.values())) - seq_output.logprobs.clear() - - logprobs.logprob = float('inf') - logprobs.rank = None - logprobs.decoded_token = None - - 
seq_output.logprobs[next_token_id] = logprobs - - seq_outputs.append(seq_output) - - else: - seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, - (group_sample_logprobs[tdx] - if any_logprobs_are_requested else { - next_token_id: - Logprob(logprob=float('inf'), - rank=None, - decoded_token=None) - }))) - if cache is not None: - completion_seq_group_output.prompt_logprobs = \ - group_prompt_logprobs if any_logprobs_are_requested else None - output.outputs.append(completion_seq_group_output) - else: - output.outputs.append( - CompletionSequenceGroupOutput( - seq_outputs, (group_prompt_logprobs - if any_logprobs_are_requested else None))) - - assert len(output.outputs) > 0 diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py deleted file mode 100644 index 25f588077cb4..000000000000 --- a/vllm/worker/multi_step_neuron_model_runner.py +++ /dev/null @@ -1,84 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from importlib.util import find_spec -from typing import List, Optional - -import torch - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors -from vllm.worker.neuron_model_runner import (ModelInputForNeuron, - NeuronModelRunner) - - -class MultiStepNeuronModelRunner(NeuronModelRunner): - """A model runner for multi step decoding using the transformers_neuronx - framework""" - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - self.speculation_config = self.speculative_config - from transformers_neuronx.config import GenerationConfig - self.speculation_config.draft_model_config.neuron_sampling_params = ( - GenerationConfig( - max_length=self.scheduler_config.max_model_len, - do_sample=True, - per_batch_line=True, - top_k=[self._MAX_NEURON_SAMPLING_TOP_K] \ - * self.scheduler_config.max_num_seqs, - top_p=[1.0] * self.scheduler_config.max_num_seqs, - temperature=[1.0] * self.scheduler_config.max_num_seqs, - dynamic=True, - global_top_k=self._MAX_NEURON_SAMPLING_TOP_K - )) - - def load_model(self) -> None: - if find_spec("transformers_neuronx") is not None: - from vllm.model_executor.model_loader.neuron import ( - get_neuron_eagle_speculation_model, - get_neuron_speculation_model) - if self.speculation_config.speculative_token_tree is not None: - self.model = get_neuron_eagle_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculation_config) - else: - self.model = get_neuron_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculation_config) - else: - raise NotImplementedError( - "Supports only Transformer-NeuronX based models.") - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - logits = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - output = self.model.sample( - 
logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return output diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py deleted file mode 100644 index dd521dd67dad..000000000000 --- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import List, Optional - -import torch - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors -from vllm.worker.neuronx_distributed_model_runner import ( - NeuronxDistributedModelRunner) - - -class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner): - """A model runner for multi-step decoding using the - neuronx-distributed-inference framework""" - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - - def load_model(self) -> None: - from vllm.model_executor.model_loader.neuronx_distributed import ( - get_neuron_speculation_model) - self.model = get_neuron_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculative_config) - - @torch.inference_mode() - def execute_model( - self, - model_input, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - sampling_params = torch.tensor([[ - seq_group.sampling_params.top_k, - seq_group.sampling_params.top_p, - seq_group.sampling_params.temperature, - ] for seq_group in model_input.sampling_metadata.seq_groups]) - - logits = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - sampling_params=sampling_params, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - output = self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return output diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py deleted file mode 100644 index ea16e14f9ecd..000000000000 --- a/vllm/worker/multi_step_worker.py +++ /dev/null @@ -1,197 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple - -import torch - -from vllm.distributed import broadcast_tensor_dict, get_pp_group -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.worker.model_runner_base import BroadcastableModelInput -from vllm.worker.multi_step_model_runner import (MultiStepModelRunner, - StatefulModelInput) -from vllm.worker.worker import Worker, WorkerInput - - -@dataclass -class MultiStepState: - worker_input: WorkerInput - model_input: StatefulModelInput - - -class MultiStepWorker(Worker): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - base_model_runner = self.model_runner - # for multi-step model, wrap the model runner with MultiStepModelRunner - self.model_runner = MultiStepModelRunner( - base_model_runner, - vllm_config=base_model_runner.vllm_config, - 
kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=base_model_runner.is_driver_worker, - ) - - pipeline_parallel_size = self.parallel_config.pipeline_parallel_size - self.multi_step_states: List[ - Optional[MultiStepState]] = [None] * pipeline_parallel_size - self.temp_output = None - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: - """ - Get the driver input and broadcast it to other workers. - """ - assert self.is_driver_worker - virtual_engine = execute_model_req.virtual_engine - is_first_multi_step = execute_model_req.is_first_multi_step - if is_first_multi_step: - # on first step we prepare the worker input and model input normally - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: StatefulModelInput = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - if execute_model_req.async_callback: - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=execute_model_req.async_callback) - else: - # on subsequent steps we reuse the worker input and model input - multi_step_state = self.multi_step_states[virtual_engine] - worker_input = multi_step_state.worker_input - model_input = multi_step_state.model_input - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.attn_metadata is not None - # clear the cached metadata so that it can be recomputed on - # the workers. - frozen_model_input.attn_metadata._cached_prefill_metadata = None - frozen_model_input.attn_metadata._cached_decode_metadata = None - - model_input.is_first_multi_step = is_first_multi_step - model_input.is_last_step = execute_model_req.is_last_step - - if not is_first_multi_step: - # we broadcast the last sampled token ids to all TP workers so they - # can update their model input metadata in-place. - self._prepare_last_sampled_token_ids_for_tp_workers( - execute_model_req=execute_model_req, model_input=model_input) - - if self.do_metadata_broadcast: - broadcast_data = worker_input.as_broadcastable_tensor_dict() - broadcast_data.update(model_input.as_broadcastable_tensor_dict()) - broadcast_tensor_dict(broadcast_data, src=0) - - # Retuning empty dict here to keep this compatible with - # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast` - return model_input, worker_input, {} - - def _prepare_last_sampled_token_ids_for_tp_workers( - self, - execute_model_req: ExecuteModelRequest, - model_input: StatefulModelInput, - ) -> None: - """ - Prepare the last sampled token ids for TP workers. If it's the last - PP rank, then the last sampled token ids are already in the model_input. - If it is NOT the last PP rank, then we need to get the last sampled - token that is cached in the execute_model_req. - """ - if get_pp_group().is_last_rank: - assert model_input.cached_outputs[ - -1].sampler_output.sampled_token_ids is None - assert model_input.cached_outputs[-1].sampled_token_ids is not None - model_input.last_sampled_token_ids = model_input.cached_outputs[ - -1].sampled_token_ids - # free sampled token ids from the previous step if it has been - # pythonized. Cannot free the last sampled token ids because - # we need it for GPU advance_step. 
- for output in model_input.cached_outputs[:-1]: - if output.pythonized: - output.sampled_token_ids = None - else: - # otherwise we need to get the cached sampled token ids from the - # execute_model_req - assert execute_model_req.last_sampled_token_ids is not None - model_input.last_sampled_token_ids = ( - execute_model_req.last_sampled_token_ids.cuda()) - model_input.add_sampler_output( - SamplerOutput(outputs=[], sampled_token_ids=None), - model_input.last_sampled_token_ids) - - # free sampled token ids from the previous step. - # TODO(will) we could reuse the sampled token ids tensor from - # the previous step instead. - for output in model_input.cached_outputs[:-1]: - output.sampled_token_ids = None - assert model_input.cached_outputs[-1].sampled_token_ids is not None - - def prepare_input( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[Tuple[StatefulModelInput, WorkerInput, Dict[str, - torch.Tensor]]]: - """ - Depending on the current state of the request and multi step worker, - this method may skip the normal _prepare_model_input and - _prepare_worker_input methods and instead used cached values. - """ - if self.is_driver_worker: - if execute_model_req is None: - if self.do_metadata_broadcast: - # This signals that there's no more requests to process for - # now. All workers are running infinite loop with - # broadcast_tensor_dict, and it stops the loop when the - # driver broadcasts an empty input. Send an empty input to - # notify all other workers to stop their execution loop. - broadcast_tensor_dict({}, src=0) - return None - - virtual_engine = execute_model_req.virtual_engine - (model_input, worker_input, - kwargs) = self._get_driver_input_and_broadcast(execute_model_req) - assert isinstance(model_input, StatefulModelInput) - if execute_model_req.is_first_multi_step: - # cache the worker input and model input for the next steps - self.multi_step_states[virtual_engine] = MultiStepState( - worker_input=worker_input, model_input=model_input) - # if TP workers - else: - broadcast_data = self._get_worker_input_from_broadcast() - # if the driver has sent an empty input, we should stop the worker - # loop - if broadcast_data is None: - return None - model_input, worker_input, kwargs = broadcast_data - assert isinstance(model_input, StatefulModelInput) - virtual_engine = worker_input.virtual_engine - if model_input.is_first_multi_step: - pass - # TODO(will) Can cache the worker input and model input for the - # next steps. See below for details - else: - # TODO(will) possible to also cache and reuse the cached worker - # input and model input. The idea is essentially the delta - # optimization for model_inputs. 
Where the TP workers can cache - # the model input states and we only broadcast the delta need - # for the next step (sampled_token_ids from the previous step) - - assert isinstance(model_input, StatefulModelInput) - # we need to update the last sampled token ids in the model - # input for the workers so that they can run inplace - # advance_step - model_input.add_sampler_output( - SamplerOutput(outputs=[], sampled_token_ids=None), - model_input.last_sampled_token_ids) - - assert model_input is not None - assert worker_input is not None - return model_input, worker_input, kwargs diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 4e1408300fb8..3e4512a63908 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -64,25 +64,21 @@ def get_tnx_model_runner(self, vllm_config): assert (self.lora_config is None), ("LoRA is not supported for TransformersNeuronX " "framework.") - from vllm.worker.multi_step_neuron_model_runner import ( - MultiStepNeuronModelRunner) if self.speculative_config is not None: - return MultiStepNeuronModelRunner(vllm_config=vllm_config) - else: - return NeuronModelRunner(vllm_config=vllm_config) + raise NotImplementedError( + "Speculative decoding is not supported for TransformersNeuronX" + ) + return NeuronModelRunner(vllm_config=vllm_config) def get_neuronx_distributed_model_runner(self, vllm_config): - from vllm.worker.multi_step_neuronx_distributed_model_runner import ( - MultiStepNeuronxDistributedModelRunner) from vllm.worker.neuronx_distributed_model_runner import ( NeuronxDistributedModelRunner) if self.speculative_config is not None: - assert (self.lora_config - is None), "LoRA is not supported for Speculative Decoding" - return MultiStepNeuronxDistributedModelRunner( - vllm_config=vllm_config) - else: - return NeuronxDistributedModelRunner(vllm_config=vllm_config) + assert (self.lora_config is None), ( + "LoRA is not supported for Speculative Decoding") + raise NotImplementedError( + "Speculative decoding is not supported for NeuronxDistributed") + return NeuronxDistributedModelRunner(vllm_config=vllm_config) def init_device(self) -> None: self.init_distributed_environment() From 1187e5062a2653112b20d56edf3729520c99673f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Aug 2025 20:21:18 -0700 Subject: [PATCH 018/233] [Misc] Remove tests/multi_step/__init__.py (#22778) Signed-off-by: Woosuk Kwon --- tests/multi_step/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/multi_step/__init__.py diff --git a/tests/multi_step/__init__.py b/tests/multi_step/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 From 753f655d6ed0facf3cbda7b703d021b3dce30322 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Aug 2025 20:38:18 -0700 Subject: [PATCH 019/233] [V0 Deprecation] Remove args for multi-step scheduling (#22779) Signed-off-by: Woosuk Kwon --- tests/utils_/test_utils.py | 1 - vllm/config/scheduler.py | 27 +-------------------------- 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 8be1e103dc65..084d82dee11b 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -161,7 +161,6 @@ def parser_with_config(): parser.add_argument('--port', type=int) parser.add_argument('--tensor-parallel-size', type=int) parser.add_argument('--trust-remote-code', action='store_true') - parser.add_argument('--multi-step-stream-outputs', action=StoreBoolean) return parser diff --git 
a/vllm/config/scheduler.py b/vllm/config/scheduler.py index db669600a0cc..93002012799a 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -115,12 +115,6 @@ class SchedulerConfig: (e.g., beam search), recomputation is not currently supported. In such a case, we use swapping instead.""" - num_scheduler_steps: int = 1 - """Maximum number of forward steps per scheduler call.""" - - multi_step_stream_outputs: bool = True - """If False, then multi-step will stream outputs at the end of all steps""" - send_delta_data: bool = False """Private API. If used, scheduler sends delta data to workers instead of an entire data. It should be enabled only @@ -193,16 +187,7 @@ def __post_init__(self) -> None: if self.max_num_batched_tokens is None: if self.enable_chunked_prefill: - if self.num_scheduler_steps > 1: - # Multi-step Chunked-Prefill doesn't allow prompt-chunking - # for now. Have max_num_batched_tokens set to max_model_len - # so we don't reject sequences on account of a short - # max_num_batched_tokens. - self.max_num_batched_tokens = max( - self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) - else: - self.max_num_batched_tokens = ( - DEFAULT_MAX_NUM_BATCHED_TOKENS) + self.max_num_batched_tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS else: # If max_model_len is too short, use # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value @@ -293,12 +278,6 @@ def _verify_args(self) -> Self: f"({self.num_lookahead_slots}) must be greater than or " "equal to 0.") - if self.num_scheduler_steps < 1: - raise ValueError( - "num_scheduler_steps " - f"({self.num_scheduler_steps}) must be greater than or " - "equal to 1.") - if self.max_num_partial_prefills < 1: raise ValueError( f"max_num_partial_prefills ({self.max_num_partial_prefills}) " @@ -323,7 +302,3 @@ def _verify_args(self) -> Self: f"max_num_partial_prefills ({self.max_num_partial_prefills}).") return self - - @property - def is_multi_step(self) -> bool: - return self.num_scheduler_steps > 1 From 19891dc6771df282a93ad528765d48ce88880a40 Mon Sep 17 00:00:00 2001 From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:21:50 +0800 Subject: [PATCH 020/233] Fix cuda illegal mem access with Llama4 TP8 + rms_norm custom op (#22701) Signed-off-by: Po-Han Huang --- vllm/model_executor/models/llama4.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 1f8b9d074479..308cb3e85e27 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -224,10 +224,14 @@ def forward( if self.rotary_emb is not None: q, k = self.rotary_emb(positions, q, k) + if self.qk_norm is not None: - q = q.reshape(-1, self.num_heads, self.head_dim) + # Normalization is applied on the head_dim dimension. The rest of + # the dimensions are collapsed into a single dimension to support + # custom rms_norm cuda kernel. 
+ q = q.reshape(-1, self.head_dim) q = self.qk_norm(q.float()).reshape(-1, self.q_size).to(q.dtype) - k = k.reshape(-1, self.num_kv_heads, self.head_dim) + k = k.reshape(-1, self.head_dim) k = self.qk_norm(k.float()).reshape(-1, self.kv_size).to(k.dtype) # We are applying temperature tuning (https://arxiv.org/abs/2501.19399) From 61419e90b4a46714874971af1e22a9552d7ffb6d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 13 Aug 2025 00:22:05 -0400 Subject: [PATCH 021/233] [Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738) Signed-off-by: mgoin --- vllm/platforms/cuda.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 70959131573f..63f6b373c322 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -152,6 +152,9 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: if cls.is_device_capability(100): # Blackwell => Force CutlassMLA. use_cutlass_mla = True + # TODO: This does not work, because the + # global_force_attn_backend_context_manager is not set. + # See vllm/attention/selector.py:_cached_get_attn_backend envs.VLLM_ATTENTION_BACKEND = "CUTLASS_MLA" else: # Not Blackwell @@ -217,7 +220,9 @@ def get_attn_backend_cls(cls, selected_backend, head_size, dtype, if use_mla: # TODO(lucas): refactor to be more concise # we should probably consider factoring out V1 here - if selected_backend == _Backend.CUTLASS_MLA: + if selected_backend == _Backend.CUTLASS_MLA or ( + cls.is_device_capability(100) and selected_backend is None + and block_size == 128): if use_v1: logger.info_once("Using Cutlass MLA backend on V1 engine.") return ("vllm.v1.attention.backends.mla." From 61b6648d43949acc1aa0498ff6b285ef20875119 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 13 Aug 2025 00:22:16 -0400 Subject: [PATCH 022/233] Force TRTLLM attention for gpt-oss on SM100 (#22678) Signed-off-by: mgoin --- vllm/model_executor/models/gpt_oss.py | 5 +---- vllm/utils/flashinfer.py | 8 ++++++++ vllm/v1/attention/backends/flashinfer.py | 11 +++++++---- vllm/v1/attention/backends/utils.py | 5 ++++- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 6a65bbbe2e0d..7c7712dbe106 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -8,7 +8,6 @@ from torch import nn from transformers import GptOssConfig -from vllm import envs from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig @@ -70,11 +69,9 @@ def __init__( tp_size = get_tensor_model_parallel_world_size() - attention_sink_dtype = (torch.float32 if envs.VLLM_USE_TRTLLM_ATTENTION - else torch.bfloat16) self.sinks = torch.nn.Parameter( torch.empty(config.num_attention_heads // tp_size, - dtype=attention_sink_dtype, + dtype=torch.bfloat16, requires_grad=False)) self.norm = RMSNorm(config.hidden_size, eps=1e-5) diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 5998d4c3127f..6b23ed426806 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -154,6 +154,7 @@ def use_trtllm_attention( num_qo_heads: Optional[int], num_kv_heads: Optional[int], attn_head_size: Optional[int], + has_sinks: bool = False, ) -> bool: # Requires SM100 and NVIDIA artifactory to be accessible to download cubins if not (current_platform.is_device_capability(100) @@ -165,6 +166,13 @@ def 
use_trtllm_attention( or num_qo_heads % num_kv_heads != 0): return False + # If sinks are being used, we must use TRTLLM attention as it's + # the only backend that supports them + if has_sinks: + logger.info_once( + "Using TRTLLM attention (required for attention sinks).") + return True + env_value = envs.VLLM_USE_TRTLLM_ATTENTION if env_value is not None: logger.info_once("VLLM_USE_TRTLLM_ATTENTION is set to %s", env_value) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index c85d8bce31f5..12e5542d691c 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -523,14 +523,17 @@ def build(self, num_kv_heads = self.kv_cache_spec.num_kv_heads head_dim = self.kv_cache_spec.head_size + # Check if any layer uses sinks (requires TRTLLM attention) + has_sinks = self.global_hyperparameters.has_sinks + # currently prefill trtllm attention does not support fp8 kv cache prefill_use_trtllm = not cache_dtype.startswith("fp8") \ and use_trtllm_attention( num_prefill_tokens, max_seq_len, cache_dtype, - num_qo_heads, num_kv_heads, head_dim) + num_qo_heads, num_kv_heads, head_dim, has_sinks) decode_use_trtllm = use_trtllm_attention( num_decode_tokens, max_seq_len, cache_dtype, - num_qo_heads, num_kv_heads, head_dim) + num_qo_heads, num_kv_heads, head_dim, has_sinks) attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, @@ -642,9 +645,9 @@ def __init__( f"heads in the layer. Expected {num_heads}, but got " f"{sinks.shape[0]}." ) + # Cast sinks to float32 if needed (FlashInfer requirement) if sinks.dtype != torch.float32: - raise ValueError("Sinks must be of type float32, but got " - f"{sinks.dtype}.") + sinks = sinks.to(torch.float32) self.sinks = sinks def forward( diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index e23dd8bc5bbb..91eb84245ac0 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -285,6 +285,7 @@ class PerLayerParameters: window_left: int logits_soft_cap: Optional[float] sm_scale: float + has_sinks: bool = False def get_per_layer_parameters( @@ -307,9 +308,11 @@ def get_per_layer_parameters( window_left = window_size[0] if window_size is not None else -1 logits_soft_cap = getattr(impl, "logits_soft_cap", None) sm_scale = impl.scale + has_sinks = getattr(impl, "sinks", None) is not None per_layer_params[key] = PerLayerParameters(window_left, - logits_soft_cap, sm_scale) + logits_soft_cap, sm_scale, + has_sinks) return per_layer_params From f776e11f500282b349e166b939891fd3d7d1fa29 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 13 Aug 2025 00:26:38 -0400 Subject: [PATCH 023/233] Remove unneeded ROCm platform import when using CUDA (#22765) Signed-off-by: mgoin --- vllm/attention/backends/rocm_flash_attn.py | 2 +- vllm/attention/ops/chunked_prefill_paged_decode.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 1ee1dea729d9..da3d9ff32830 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -22,7 +22,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape) from vllm.platforms import current_platform -from vllm.platforms.rocm import use_rocm_custom_paged_attention if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata @@ -886,6 +885,7 @@ def forward( num_seqs, 
num_heads, head_size = decode_query.shape block_size = value_cache.shape[3] gqa_ratio = num_heads // self.num_kv_heads + from vllm.platforms.rocm import use_rocm_custom_paged_attention use_custom = use_rocm_custom_paged_attention( decode_query.dtype, head_size, block_size, gqa_ratio, decode_meta.max_decode_seq_len, self.sliding_window, diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index dc10d7eca9c2..e5b90a8b2755 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -11,7 +11,6 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.platforms.rocm import use_rocm_custom_paged_attention from vllm.triton_utils import tl, triton from .prefix_prefill import context_attention_fwd @@ -296,6 +295,7 @@ def chunked_prefill_paged_decode( num_queries_per_kv_padded = max(triton.next_power_of_2(num_queries_per_kv), 16) + from vllm.platforms.rocm import use_rocm_custom_paged_attention use_custom = use_rocm_custom_paged_attention( query.dtype, head_size, From 50bd03332d7791319bbea6b8ecf4db8ecf70c14e Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 13 Aug 2025 00:31:47 -0400 Subject: [PATCH 024/233] [Bug] Fix Unexpected Keyword Argument 'w1_bias' (#22757) Signed-off-by: yewentao256 --- vllm/model_executor/layers/fused_moe/layer.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index fb38fb91ead6..8ef0a805d86c 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -475,12 +475,11 @@ def forward_cuda( activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) else: - return self.fused_experts( + # add w1_bias/w2_bias to kwargs if they exist + kwargs = dict( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, - w1_bias=layer.w13_bias if self.has_bias else None, - w2_bias=layer.w2_bias if self.has_bias else None, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, @@ -489,6 +488,17 @@ def forward_cuda( global_num_experts=global_num_experts, expert_map=expert_map, ) + if isinstance(self.fused_experts, + FusedMoEModularKernel) and self.has_bias: + raise ValueError( + "FusedMoEModularKernel does not support bias.") + if self.has_bias: + kwargs.update({ + "w1_bias": getattr(layer, "w13_bias", None), + "w2_bias": getattr(layer, "w2_bias", None), + }) + + return self.fused_experts(**kwargs) def forward_cpu( self, From cbb55083439048ab4c78039fbaad05ac90e3d2da Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Tue, 12 Aug 2025 21:34:47 -0700 Subject: [PATCH 025/233] [Perf] Support topk softmax fused kernel for broader num_experts (#22211) Signed-off-by: Shixian Cui Co-authored-by: Shixian Cui --- csrc/moe/topk_softmax_kernels.cu | 77 +++++++++++++++++++------------- tests/kernels/moe/test_moe.py | 2 +- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 7a7865b901de..946c137db636 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -188,7 +188,9 @@ __launch_bounds__(TPB) __global__ void moeTopK( It fuses the softmax, max and argmax into a single kernel. 
Limitations: - 1) This implementation is intended for when the number of experts is a small power of 2. + 1) This implementation is optimized for when the number of experts is a small power of 2. + Additionally it also supports when number of experts is multiple of 64 which is still + faster than the computing softmax and topK separately (only tested on CUDA yet). 2) This implementation assumes k is small, but will work for any k. */ @@ -198,8 +200,6 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__ int* source_rows, const int k, const int start_expert, const int end_expert) { // We begin by enforcing compile time assertions and setting up compile time constants. - static_assert(VPT == (VPT & -VPT), "VPT must be power of 2"); - static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2"); static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2"); static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16"); @@ -407,12 +407,10 @@ struct TopkConstants }; } // namespace detail -template +template void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices, int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream) { - static constexpr std::size_t MAX_BYTES_PER_LDG = 16; - static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS); using Constants = detail::TopkConstants; static constexpr int VPT = Constants::VPT; @@ -425,21 +423,12 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert); } -#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \ - switch (warpSize) { \ - case 32: \ - topkGatingSoftmaxLauncherHelper( \ - gating_output, nullptr, topk_weights, topk_indices, \ - token_expert_indices, num_tokens, topk, 0, num_experts, stream); \ - break; \ - case 64: \ - topkGatingSoftmaxLauncherHelper( \ - gating_output, nullptr, topk_weights, topk_indices, \ - token_expert_indices, num_tokens, topk, 0, num_experts, stream); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported warp size: ", warpSize); \ - } +#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \ + static_assert(WARP_SIZE == 32 || WARP_SIZE == 64, \ + "Unsupported warp size. 
Only 32 and 64 are supported."); \ + topkGatingSoftmaxLauncherHelper( \ + gating_output, nullptr, topk_weights, topk_indices, \ + token_expert_indices, num_tokens, topk, 0, num_experts, stream); template void topkGatingSoftmaxKernelLauncher( @@ -453,38 +442,62 @@ void topkGatingSoftmaxKernelLauncher( const int topk, cudaStream_t stream) { static constexpr int WARPS_PER_TB = 4; - auto warpSize = WARP_SIZE; + static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16; + static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8; switch (num_experts) { case 1: - LAUNCH_SOFTMAX(1, WARPS_PER_TB); + LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 2: - LAUNCH_SOFTMAX(2, WARPS_PER_TB); + LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 4: - LAUNCH_SOFTMAX(4, WARPS_PER_TB); + LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 8: - LAUNCH_SOFTMAX(8, WARPS_PER_TB); + LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 16: - LAUNCH_SOFTMAX(16, WARPS_PER_TB); + LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 32: - LAUNCH_SOFTMAX(32, WARPS_PER_TB); + LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 64: - LAUNCH_SOFTMAX(64, WARPS_PER_TB); + LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 128: - LAUNCH_SOFTMAX(128, WARPS_PER_TB); + LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); break; case 256: - LAUNCH_SOFTMAX(256, WARPS_PER_TB); + LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); + break; + case 512: + LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2); + break; + // (CUDA only) support multiples of 64 when num_experts is not power of 2. + // ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts, + // alternatively we can test 4 bytes loading and enable it in future. 
+#ifndef USE_ROCM + case 192: + LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); break; + case 320: + LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 384: + LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 448: + LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; + case 576: + LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64); + break; +#endif default: { TORCH_CHECK(softmax_workspace != nullptr, - "softmax_workspace must be provided for num_experts that are not a power of 2."); + "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64."); static constexpr int TPB = 256; moeSoftmax<<>>( gating_output, nullptr, softmax_workspace, num_experts); diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 0f1c78704642..49c097718e30 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -36,7 +36,7 @@ from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types -NUM_EXPERTS = [8, 64] +NUM_EXPERTS = [8, 64, 192] EP_SIZE = [1, 4] TOP_KS = [2, 6] From dd5c24607a6edffe634653c5df4607947e25c0dc Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 12 Aug 2025 21:37:26 -0700 Subject: [PATCH 026/233] [gpt-oss] upgrade gpt-oss to v0.0.3 and add version check (#22768) Signed-off-by: Chen Zhang --- vllm/entrypoints/tool.py | 51 ++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py index 723cff91d44c..758789a5e059 100644 --- a/vllm/entrypoints/tool.py +++ b/vllm/entrypoints/tool.py @@ -2,9 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Optional - -from openai_harmony import Message +from typing import TYPE_CHECKING, Any from vllm.logger import init_logger @@ -15,6 +13,30 @@ logger = init_logger(__name__) +def validate_gpt_oss_install(): + """ + Check if the gpt-oss is installed and its version is at least 0.0.3. + If not, raise an ImportError. + """ + from importlib.metadata import PackageNotFoundError, version + + from packaging.version import InvalidVersion, Version + + try: + pkg_version_str = version("gpt_oss") # e.g., "0.0.5" + pkg_version = Version(pkg_version_str) + except PackageNotFoundError: + raise ImportError("Package 'gpt_oss' is not installed.") from None + except InvalidVersion as e: + raise ImportError( + f"Invalid version string for 'gpt_oss': {e}") from None + + if pkg_version < Version("0.0.3"): + raise ImportError( + f"gpt_oss >= 0.0.3 is required, but {pkg_version} is installed." 
+ ) from None + + class Tool(ABC): @abstractmethod @@ -33,12 +55,14 @@ def __init__(self): return try: + validate_gpt_oss_install() from gpt_oss.tools.simple_browser import SimpleBrowserTool from gpt_oss.tools.simple_browser.backend import ExaBackend - except ImportError: + except ImportError as e: self.enabled = False logger.warning_once( - "gpt_oss is not installed, browsing is disabled") + "gpt_oss is not installed properly (%s), browsing is disabled", + e) return browser_backend = ExaBackend(source="web", api_key=exa_api_key) @@ -65,23 +89,16 @@ def __init__(self): self.enabled = True try: + validate_gpt_oss_install() from gpt_oss.tools.python_docker.docker_tool import PythonTool - except ImportError: + except ImportError as e: self.enabled = False logger.warning_once( - "gpt_oss is not installed, code interpreter is disabled") + "gpt_oss is not installed properly (%s), code interpreter is " + "disabled", e) return - # NOTE (Chen): as of gpt-oss 0.0.2, there is a bug in _make_response - # and we do the following monkey patch to fix it. - class PatchedGptOssPythonTool(PythonTool): - - def _make_response(self, - output: str, - channel: Optional[str] = None) -> Message: - return super()._make_response(output) - - self.python_tool = PatchedGptOssPythonTool() + self.python_tool = PythonTool() logger.info_once("Code interpreter tool initialized") async def get_result(self, context: "ConversationContext") -> Any: From f362240ae578344e2dac1ad5cbaf0254991e8ec7 Mon Sep 17 00:00:00 2001 From: zzh142857 Date: Wed, 13 Aug 2025 03:09:13 -0400 Subject: [PATCH 027/233] [Model] Add option to run Step3VisionEncoder in DP (#22697) Signed-off-by: zzh142857 --- vllm/model_executor/models/step3_vl.py | 132 +++++++++++++++++-------- 1 file changed, 91 insertions(+), 41 deletions(-) diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 41dba312cb42..f1f38c01b784 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, + ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -33,6 +34,7 @@ BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.utils import run_dp_sharded_vision_model from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import Step3VisionEncoderConfig from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -650,7 +652,8 @@ class Step3VisionAttention(nn.Module): def __init__(self, config, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -659,20 +662,42 @@ def __init__(self, self.scale = self.head_dim**-0.5 - tp_size = get_tensor_model_parallel_world_size() + tp_size = (1 if use_data_parallel else + get_tensor_model_parallel_world_size()) assert self.total_num_heads % tp_size == 0 self.num_heads = self.total_num_heads // tp_size - self.qkv_proj = QKVParallelLinear(self.embed_dim, - self.head_dim, - self.total_num_heads, - bias=True, - quant_config=quant_config, - prefix=prefix) - self.out_proj = RowParallelLinear(self.embed_dim, - 
self.embed_dim, - bias=True, - quant_config=quant_config, - prefix=prefix) + + self.q_size = self.num_heads * self.head_dim + + if use_data_parallel: + self.qkv_proj = ReplicatedLinear( + self.embed_dim, + 3 * self.q_size, + bias=True, + quant_config=quant_config, + prefix=prefix, + ) + self.out_proj = ReplicatedLinear( + self.total_num_heads * self.head_dim, + self.embed_dim, + bias=True, + quant_config=quant_config, + prefix=prefix, + ) + else: + self.qkv_proj = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.total_num_heads, + bias=True, + quant_config=quant_config, + prefix=prefix, + ) + self.out_proj = RowParallelLinear(self.embed_dim, + self.embed_dim, + bias=True, + quant_config=quant_config, + prefix=prefix) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, @@ -712,20 +737,25 @@ class Step3VisionMLP(nn.Module): def __init__(self, config, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) - self.fc1 = ColumnParallelLinear(config.hidden_size, - config.intermediate_size, - bias=True, - quant_config=quant_config, - prefix=prefix) - self.fc2 = RowParallelLinear(config.intermediate_size, - config.hidden_size, - bias=True, - quant_config=quant_config, - prefix=prefix) + cls_fc1 = (ReplicatedLinear + if use_data_parallel else ColumnParallelLinear) + self.fc1 = cls_fc1(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config, + prefix=prefix) + cls_fc2 = (ReplicatedLinear + if use_data_parallel else RowParallelLinear) + self.fc2 = cls_fc2(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config, + prefix=prefix) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -739,15 +769,22 @@ class Step3VisionEncoderLayer(nn.Module): def __init__(self, config: Step3VisionEncoderConfig, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() + self.use_data_parallel = use_data_parallel self.embed_dim = config.hidden_size - self.self_attn = Step3VisionAttention(config, - quant_config, - prefix=f"{prefix}.self_attn") + self.self_attn = Step3VisionAttention( + config, + quant_config, + prefix=f"{prefix}.self_attn", + use_data_parallel=self.use_data_parallel) self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = Step3VisionMLP(config, quant_config, prefix=f"{prefix}.mlp") + self.mlp = Step3VisionMLP(config, + quant_config, + prefix=f"{prefix}.mlp", + use_data_parallel=self.use_data_parallel) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -767,13 +804,16 @@ class Step3VisionEncoder(nn.Module): def __init__(self, config: Step3VisionEncoderConfig, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config + self.use_data_parallel = use_data_parallel self.layers = nn.ModuleList([ Step3VisionEncoderLayer(config, quant_config, - prefix=f"{prefix}.layers.{i}") + prefix=f"{prefix}.layers.{i}", + use_data_parallel=self.use_data_parallel) for i in range(config.num_hidden_layers) ]) @@ -792,21 +832,29 @@ class Step3VisionTransformer(nn.Module): def __init__(self, config: Step3VisionEncoderConfig, 
quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + use_data_parallel: bool = False): super().__init__() self.config = config + self.use_data_parallel = use_data_parallel self.image_size = config.image_size self.embeddings = Step3VisionEmbeddings(config) - self.transformer = Step3VisionEncoder(config, - quant_config, - prefix=f"{prefix}.transformer") + self.transformer = Step3VisionEncoder( + config, + quant_config, + prefix=f"{prefix}.transformer", + use_data_parallel=self.use_data_parallel) def forward( self, pixel_values: torch.Tensor, ): hidden_states = self.embeddings(pixel_values) - hidden_states = self.transformer(inputs_embeds=hidden_states) + if self.use_data_parallel: + hidden_states = run_dp_sharded_vision_model( + hidden_states, self.transformer) + else: + hidden_states = self.transformer(inputs_embeds=hidden_states) return hidden_states @@ -836,13 +884,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.config = config self.multimodal_config = multimodal_config + self.use_data_parallel = (vllm_config.parallel_config. + enable_multimodal_encoder_data_parallel) if multimodal_config.get_limit_per_prompt("image"): - self.vision_model = Step3VisionTransformer(config.vision_config, - None, - prefix=maybe_prefix( - prefix, - "vision_model")) + self.vision_model = Step3VisionTransformer( + config.vision_config, + None, + prefix=maybe_prefix(prefix, "vision_model"), + use_data_parallel=self.use_data_parallel) self.vit_downsampler = nn.Conv2d( config.vision_config.hidden_size, config.vision_config.output_hidden_size, From ee22b087ff1ff6a71d21acb061963369f8e0088e Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Wed, 13 Aug 2025 16:23:33 +0800 Subject: [PATCH 028/233] [Model] Add missing prefix to glm4_1v (#22716) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_1v.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 2a89c03bfe7e..88c53c836327 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -453,25 +453,30 @@ def __init__( context_dim: int, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = d_model self.proj = ColumnParallelLinear(self.hidden_size, self.hidden_size, bias=bias, - gather_output=True) + gather_output=True, + quant_config=quant_config, + prefix=f"{prefix}.proj") self.post_projection_norm = nn.LayerNorm(self.hidden_size) self.gate_up_proj = MergedColumnParallelLinear( input_size=self.hidden_size, output_sizes=[context_dim] * 2, bias=bias, quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", ) self.down_proj = RowParallelLinear( context_dim, self.hidden_size, bias=bias, quant_config=quant_config, + prefix=f"{prefix}.down_proj", ) self.act_fn = SiluAndMul() self.extra_activation_func = nn.GELU() @@ -661,6 +666,7 @@ def __init__( context_dim=vision_config.intermediate_size, quant_config=quant_config, bias=False, + prefix=f"{prefix}.merger", ) self.embeddings = Glm4vVisionEmbeddings(vision_config) From 66c5b95c15d25afa4b4138f1576816b170ecd327 Mon Sep 17 00:00:00 2001 From: Duc-Viet Hoang Date: Wed, 13 Aug 2025 17:11:36 +0700 Subject: [PATCH 029/233] [Bugfix] Fix Nemotron VL image processing (#22739) Co-authored-by: ducviet00-h2 --- .../multimodal/processing/test_nemotron_vl.py | 8 +- 
vllm/model_executor/models/nemotron_vl.py | 186 ++++++++++++++++++ 2 files changed, 190 insertions(+), 4 deletions(-) diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py index 3ce88bc427f5..6fbbab0d2612 100644 --- a/tests/models/multimodal/processing/test_nemotron_vl.py +++ b/tests/models/multimodal/processing/test_nemotron_vl.py @@ -23,15 +23,15 @@ def _get_expected_num_patches( min_num: int, max_num: int, ): - from vllm.model_executor.models.internvl import ( - calculate_internvl_targets, get_internvl_target_ratios) + from vllm.model_executor.models.nemotron_vl import ( + calculate_nemotron_vl_targets, get_nemotron_vl_target_ratios) width, height = image.size - blocks, _, _ = calculate_internvl_targets( + blocks, _, _ = calculate_nemotron_vl_targets( orig_width=width, orig_height=height, - target_ratios=get_internvl_target_ratios( + target_ratios=get_nemotron_vl_target_ratios( min_num, max_num, ), diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index b90cb9b39a60..82bcd064624f 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -13,6 +13,7 @@ import torch import torch.nn as nn +import torchvision.transforms as T from PIL import Image from transformers import AutoModel, PretrainedConfig from transformers.image_processing_utils_fast import BaseImageProcessorFast @@ -27,6 +28,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.processing import PromptUpdateDetails from vllm.sequence import IntermediateTensors @@ -44,6 +46,146 @@ IMG_CONTEXT = '' +def build_transform(input_size: int): + return T.Compose([ + T.Lambda(lambda img: convert_image_mode(img, 'RGB')), + T.Resize((input_size, input_size), + interpolation=T.InterpolationMode.BICUBIC), + T.ToTensor(), + ]) + + +# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1 +def find_closest_aspect_ratio( + aspect_ratio: float, + target_ratios: list[tuple[int, int]], + *, + width: int, + height: int, + image_size: int, +) -> tuple[int, int]: + best_factor = float('-inf') + best_ratio = (1, 1) + area = width * height + + for rw, rh in target_ratios: + target_aspect_ratio = rw / rh + size_factor = min((rw * rh * image_size * image_size) / area, 0.6) + ratio_closeness = min(target_aspect_ratio / aspect_ratio, + aspect_ratio / target_aspect_ratio) + factor = size_factor * ratio_closeness + + if factor > best_factor: + best_factor = factor + best_ratio = (rw, rh) + + return best_ratio + + +def calculate_nemotron_vl_targets( + *, + orig_width: int, + orig_height: int, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> tuple[int, int, int]: + aspect_ratio = orig_width / orig_height + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, + target_ratios, + width=orig_width, + height=orig_height, + image_size=image_size, + ) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # add thumbnail image if num_blocks != 1 + if use_thumbnail and blocks 
!= 1: + blocks += 1 + + return blocks, target_width, target_height + + +def dynamic_preprocess_nemotron_vl( + image: Image.Image, + *, + target_ratios: list[tuple[int, int]], + image_size: int, + use_thumbnail: bool, +) -> list[Image.Image]: + orig_width, orig_height = image.size + + # calculate the number of blocks without thumbnail + blocks, target_width, target_height = calculate_nemotron_vl_targets( + orig_width=orig_width, + orig_height=orig_height, + target_ratios=target_ratios, + image_size=image_size, + use_thumbnail=False, + ) + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ((i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + + assert len(processed_images) == blocks + + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + + return processed_images + + +def get_nemotron_vl_target_ratios( + min_num: int, + max_num: int, +) -> list[tuple[int, int]]: + target_ratios = {(i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) if min_num <= i * j <= max_num} + return sorted(target_ratios, key=lambda x: x[0] * x[1]) + + +def image_to_pixel_values_nemotron_vl( + image: Image.Image, + *, + input_size: int, + min_num: int, + max_num: int, + use_thumbnail: bool, +) -> torch.Tensor: + target_ratios = get_nemotron_vl_target_ratios(min_num, max_num) + + transform = build_transform(input_size=input_size) + + images = dynamic_preprocess_nemotron_vl( + image, + target_ratios=target_ratios, + image_size=input_size, + use_thumbnail=use_thumbnail, + ) + + pixel_values = torch.stack([transform(image) for image in images]) + return pixel_values + + class NemotronVLProcessor(InternVLProcessor): def __init__( @@ -87,6 +229,50 @@ def __init__( def image_token_id(self) -> int: return self.tokenizer.get_vocab()[IMG_CONTEXT] + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + target_ratios = self.resolve_target_ratios( + use_thumbnail=False, # Applied in calculate_targets + ) + + num_patches, _, _ = calculate_nemotron_vl_targets( + orig_width=image_width, + orig_height=image_height, + image_size=self.image_size, + target_ratios=target_ratios, + use_thumbnail=self.use_thumbnail, + ) + + return num_patches * self.num_image_token + + def _images_to_pixel_values_lst( + self, + images: list[Image.Image], + min_dynamic_patch: Optional[int] = None, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, + ) -> list[torch.Tensor]: + min_num, max_num = self.resolve_min_max_num( + min_dynamic_patch=min_dynamic_patch, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + use_thumbnail=False, # Applied in image_to_pixel_values + ) + + return [ + image_to_pixel_values_nemotron_vl( + image, + input_size=self.image_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=self.use_thumbnail, + ) for image in images + ] + def _preprocess_image( self, text: list[str], From 96ddae46c164ec685c68012bd6fb6baf128fce03 Mon Sep 17 00:00:00 2001 From: 633WHU Date: Wed, 13 Aug 2025 19:10:07 +0800 Subject: [PATCH 030/233] [Doc] Add max_lora_rank 
configuration guide (#22782) Signed-off-by: chiliu --- docs/features/lora.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/features/lora.md b/docs/features/lora.md index a4e05dae11c2..668460a368a7 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -351,3 +351,22 @@ vllm serve ibm-granite/granite-speech-3.3-2b \ ``` Note: Default multimodal LoRAs are currently only available for `.generate` and chat completions. + +## Using Tips + +### Configuring `max_lora_rank` + +The `--max-lora-rank` parameter controls the maximum rank allowed for LoRA adapters. This setting affects memory allocation and performance: + +- **Set it to the maximum rank** among all LoRA adapters you plan to use +- **Avoid setting it too high** - using a value much larger than needed wastes memory and can cause performance issues + +For example, if your LoRA adapters have ranks [16, 32, 64], use `--max-lora-rank 64` rather than 256 + +```bash +# Good: matches actual maximum rank +vllm serve model --enable-lora --max-lora-rank 64 + +# Bad: unnecessarily high, wastes memory +vllm serve model --enable-lora --max-lora-rank 256 +``` From 24fddcf491309be2a99af303e244bff82f8b7681 Mon Sep 17 00:00:00 2001 From: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com> Date: Wed, 13 Aug 2025 04:11:28 -0700 Subject: [PATCH 031/233] [V1] Add tree drafting tests for eagle spec decoding (#22705) Signed-off-by: Giancarlo Delfin --- tests/v1/spec_decode/test_eagle.py | 160 +++++++++++++++++++++++- tests/v1/spec_decode/test_max_len.py | 6 - vllm/v1/attention/backends/tree_attn.py | 6 +- vllm/v1/spec_decode/eagle.py | 61 +++------ 4 files changed, 178 insertions(+), 55 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 2b4f8bd2a8b9..7b8445a0b287 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional from unittest import mock import pytest @@ -23,7 +24,11 @@ eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" -def _create_proposer(method: str, k: int) -> EagleProposer: +def _create_proposer( + method: str, + num_speculative_tokens: int, + speculative_token_tree: Optional[list[tuple[int]]] = None, +) -> EagleProposer: model_config = ModelConfig(model=model_dir, runner="generate", max_model_len=100) @@ -31,12 +36,18 @@ def _create_proposer(method: str, k: int) -> EagleProposer: # Choose model directory based on method draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir + spec_token_tree_str = None + if speculative_token_tree is not None: + assert num_speculative_tokens == len(speculative_token_tree) + spec_token_tree_str = str(speculative_token_tree) + speculative_config = SpeculativeConfig( target_model_config=model_config, target_parallel_config=ParallelConfig(), model=draft_model_dir, method=method, - num_speculative_tokens=k, + num_speculative_tokens=num_speculative_tokens, + speculative_token_tree=spec_token_tree_str, ) vllm_config = VllmConfig( @@ -189,7 +200,7 @@ class _TargetModelStub(LlamaForCausalLM): target_model.lm_head = mock.MagicMock() # Create proposer using the helper function - proposer = _create_proposer(method, k=8) + proposer = _create_proposer(method, num_speculative_tokens=8) # Call the method under test proposer.load_model(target_model) @@ -226,6 +237,10 @@ def test_propose(method, attn_backend, num_speculative_tokens, 
monkeypatch): pytest.skip("TRITON_ATTN_VLLM_V1 does not support " "multi-token eagle spec decode on current platform") + if (attn_backend == "TREE_ATTN"): + pytest.skip("TREE_ATTN is tested separately in test_propose_tree" + "because it requires special input mocking.") + if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm(): monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") @@ -378,3 +393,142 @@ def create_deterministic_logits(token_ids): # Verify all tokens match our expectations assert torch.equal(result, expected_tokens) + + +@pytest.mark.parametrize( + "spec_token_tree", + [ + [(0, )], # A single token + [(0, ), (0, 0), (0, 0, 0)], # Chain + [(0, ), (1, ), (2, )], # Parallel + [(0, ), (1, ), (2, ), (0, 0), (0, 1), (1, 0), (1, 1), (2, 0), + (2, 1)], # Tree + ]) +def test_propose_tree(spec_token_tree): + # Get GPU device. + device = torch.device(current_platform.device_type) + + # Setup test parameters. + batch_size = 2 + seq_len_1 = 5 + seq_len_2 = 3 + total_tokens = seq_len_1 + seq_len_2 + vocab_size = 100 + seq_lens = [seq_len_1, seq_len_2] + num_speculative_tokens = len(spec_token_tree) + + # Create proposer first so we can use its actual hidden_size. + proposer = _create_proposer("eagle", + num_speculative_tokens, + speculative_token_tree=spec_token_tree) + # Get the hidden_size from the proposer to ensure consistency. + hidden_size = proposer.hidden_size + + # Helper to create deterministic logits that will produce specific tokens + def create_deterministic_logits(token_ids, k: int): + logits = torch.full((batch_size, vocab_size), -100.0, device=device) + for i, token_id in enumerate(token_ids): + # Assign decreasing values to the k, consecutive, tokens. + for j in range(k): + logits[i, token_id + j] = 100.0 - j + return logits + + # Mock a model that returns deterministic logits. + base_token_ids = torch.tensor([42, 60], dtype=torch.int64, device=device) + + # Skip loading the model and replace it with a mock that returns + # deterministic outputs. + model_mock = mock.MagicMock() + + # Mock the model forward calls. + forward_returns = [(torch.zeros(total_tokens, hidden_size, device=device), + torch.zeros(total_tokens, hidden_size, device=device))] + for cu_num_drafts in proposer.cu_drafts_per_level: + h_logits = torch.zeros(batch_size * cu_num_drafts, + hidden_size, + device=device) + h_states = torch.zeros(batch_size * cu_num_drafts, + hidden_size, + device=device) + forward_returns.append((h_logits, h_states)) + model_mock.side_effect = forward_returns + + # Mock the compute_logits calls. + cu_num_drafts_tensor = torch.tensor([0] + proposer.cu_drafts_per_level, + dtype=torch.int32, + device=device) + logits_returns = [] + for level, num_children in enumerate(proposer.child_drafts_per_level): + token_ids = base_token_ids + cu_num_drafts_tensor[level] + level_num_drafts = cu_num_drafts_tensor[ + level + 1] - cu_num_drafts_tensor[level] + level_logits = [] + for i in range(level_num_drafts // num_children): + level_logits.append( + create_deterministic_logits(token_ids + i * num_children, + num_children)) + logits_returns.append(torch.stack(level_logits, dim=1)) + model_mock.compute_logits.side_effect = logits_returns + + # Assign the mock to the proposer + proposer.model = model_mock + + # Assign draft attn_layer_names since load_model is not invoked + proposer.attn_layer_names = ["layer.0"] + + # Get the tree attention metadata builder. 
+ attn_metadata_builder_cls, _ = get_attention_backend(_Backend.TREE_ATTN) + attn_metadata_builder = attn_metadata_builder_cls( + kv_cache_spec=create_standard_kv_cache_spec(proposer.vllm_config), + layer_names=proposer.attn_layer_names, + vllm_config=proposer.vllm_config, + device=device, + ) + + # Mock runner for attention metadata building. + proposer.runner = mock.MagicMock() + proposer.runner.attn_groups.append([mock.MagicMock()]) + proposer.runner.attn_groups[0][0].metadata_builder = attn_metadata_builder + + # Setup inputs for the proposer. + target_token_ids = torch.randint(0, + vocab_size, (total_tokens, ), + device=device) + target_positions = torch.cat([ + torch.arange(seq_len_1, device=device), + torch.arange(seq_len_2, device=device) + ]) + target_hidden_states = torch.randn(total_tokens, + hidden_size, + device=device) + next_token_ids = torch.randint(0, + vocab_size, (batch_size, ), + dtype=torch.int32, + device=device) + batch_spec = BatchSpec( + seq_lens=seq_lens, + query_lens=seq_lens, + ) + common_attn_metadata = create_common_attn_metadata( + batch_spec, + block_size=16, + device=device, + ) + sampling_metadata = mock.MagicMock() + + # Propose draft tokens. + result = proposer.propose(target_token_ids=target_token_ids, + target_positions=target_positions, + target_hidden_states=target_hidden_states, + next_token_ids=next_token_ids, + common_attn_metadata=common_attn_metadata, + sampling_metadata=sampling_metadata) + assert result.shape == (batch_size, num_speculative_tokens) + + # The tokens are expected to be consecutive integers starting + # from the base token IDs. + expected_tokens = base_token_ids[:, None] + torch.arange( + num_speculative_tokens, dtype=torch.int64, device=device) + + # Verify that the draft tokens match our expectations. + assert torch.equal(result, expected_tokens) diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index 01019b29e010..a5b10bb51866 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -39,12 +39,6 @@ def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - - if attn_backend == "TREE_ATTN" and num_speculative_tokens > 1: - # TREE_ATTN fails the test with multi-token spec decode - # TODO: Investigate why - pytest.skip("TREE_ATTN fails the test") - m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) if (attn_backend == "TRITON_ATTN_VLLM_V1" diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index 3b53b039f1dc..5d10e9e26082 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -236,9 +236,9 @@ def build_for_drafting( # Use prefill for drafting at the root level. self.tree_attn_bias = torch.empty(0) else: - # Slice the tree attention bias for drafting. - query_len = common_attn_metadata.max_query_len - start, end = draft_index, draft_index + query_len + # Slice the tree attention bias for drafting. Exclude + # the root level. 
+ start, end = 1, 1 + common_attn_metadata.max_query_len self.tree_attn_bias = self.tree_attn_bias[start:end, start:end].contiguous() diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index f75d76dd978f..a8a160a0f995 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -113,13 +113,6 @@ def __init__( num_drafts_per_level[level]) self.child_drafts_per_level.append(num_drafts_per_level[level] // num_drafts_per_level[level - 1]) - # Find the first level where the tree branches off into one or more - # children. - self.first_branching_level = None - for level in range(tree_depth): - if self.cu_drafts_per_level[level] > level + 1: - self.first_branching_level = level - break # Precompute draft position offsets in flattened tree. self.tree_draft_pos_offsets = torch.arange( 1, @@ -209,11 +202,10 @@ def propose( logits = self.model.compute_logits(sample_hidden_states, None) positions = target_positions[last_token_indices] hidden_states = hidden_states[last_token_indices] - if self.first_branching_level == 0: - # Branching has occurred at the root level. Draft using tree - # attention. + + if isinstance(attn_metadata, TreeAttentionMetadata): + # Draft using tree attention. draft_token_ids_list = self.propose_tree( - tree_root_level=0, batch_size=batch_size, logits=logits, positions=positions, @@ -242,11 +234,10 @@ def propose( (TritonAttentionMetadata, AiterFlashAttentionMetadata, FlashAttentionMetadata)) else: - # Currently, only FlashAttention and TreeAttention support - # multi-token eagle spec decode. This is because the code below - # makes assumptions about attn_metadata attributes available. - assert isinstance(attn_metadata, - (FlashAttentionMetadata, TreeAttentionMetadata)) + # Currently, only FlashAttention supports multi-token eagle spec + # decode. This is because the code below makes assumptions about + # attn_metadata attributes available. + assert isinstance(attn_metadata, FlashAttentionMetadata) # Generate the remaining draft tokens. draft_token_ids_list = [draft_token_ids] @@ -259,7 +250,7 @@ def propose( attn_metadata.num_actual_tokens = batch_size attn_metadata.max_query_len = 1 attn_metadata.query_start_loc = self.arange[:batch_size + 1] - for token_index in range(self.num_speculative_tokens - 1): + for _ in range(self.num_speculative_tokens - 1): # Update the inputs. # cast to int32 is crucial when eagle model is compiled. # tensor.argmax() returns int64 by default. @@ -327,21 +318,6 @@ def propose( hidden_states = hidden_states[:batch_size] logits = self.model.compute_logits(last_hidden_states[:batch_size], None) - - if self.first_branching_level == token_index + 1: - # Branching has occurred. The remaining tokens are drafted - # using tree attention. 
- draft_token_ids_list += self.propose_tree( - tree_root_level=token_index + 1, - batch_size=batch_size, - logits=logits, - positions=positions, - hidden_states=hidden_states, - common_attn_metadata=common_attn_metadata, - ) - # [batch_size, num_tree_tokens] - return torch.cat(draft_token_ids_list, dim=1) - draft_token_ids = logits.argmax(dim=-1) draft_token_ids_list.append(draft_token_ids) @@ -351,7 +327,6 @@ def propose( def propose_tree( self, - tree_root_level: int, batch_size: int, # [num_tokens, vocab_size] logits: torch.Tensor, @@ -366,10 +341,10 @@ def propose_tree( assert isinstance(tree_attn_metadata_builder, TreeAttentionMetadataBuilder) - total_num_drafts = self.cu_drafts_per_level[tree_root_level] + total_num_drafts = self.cu_drafts_per_level[0] level_num_drafts = total_num_drafts # Sample a draft token for each child at the tree root level. - num_children = self.child_drafts_per_level[tree_root_level] + num_children = self.child_drafts_per_level[0] if num_children == 1: draft_token_ids = logits.argmax(dim=-1).view(batch_size, -1) else: @@ -393,22 +368,23 @@ def propose_tree( positions.view(batch_size, -1) + self.tree_draft_pos_offsets[:batch_size, :]) tree_depth = len(self.cu_drafts_per_level) - for level in range(tree_root_level, tree_depth - 1): + for level in range(tree_depth - 1): # Get draft positions for RoPE. draft_positions = positions + (level + 1) exceeds_max_model_len = (positions + total_num_drafts) >= self.max_model_len # Mask out the position ids that exceed the max model length. # Otherwise, we may get out-of-range error in RoPE. - clamped_draft_positions = torch.where( + draft_positions = torch.where( exceeds_max_model_len, 0, draft_positions, - ) + ).view(batch_size, -1) + if level_num_drafts > 1: # Repeat the positions for each draft at this level. - draft_positions = clamped_draft_positions.repeat_interleave( - level_num_drafts).reshape(batch_size, -1) + draft_positions = draft_positions.repeat_interleave( + level_num_drafts, dim=1) if num_children > 1: # Repeat draft hidden states for each child. @@ -425,7 +401,7 @@ def propose_tree( # Build new attention metadata for the next level of drafts. # This is necessary to support tree attention. - query_len = total_num_drafts - tree_root_level + query_len = total_num_drafts common_attn_metadata = replace( common_attn_metadata, query_start_loc=query_len * self.arange[:batch_size + 1], @@ -435,7 +411,7 @@ def propose_tree( ) attn_metadata = tree_attn_metadata_builder.build_for_drafting( common_attn_metadata=common_attn_metadata, - draft_index=tree_root_level + 1, + draft_index=level + 1, ) # Apply new attention metadata to all layers. 
@@ -516,7 +492,6 @@ def propose_tree( level_num_drafts = self.cu_drafts_per_level[level + 1] - total_num_drafts total_num_drafts = self.cu_drafts_per_level[level + 1] - return draft_token_ids_list def prepare_inputs( From 4acdadb91e855d18b03708bbc1960a6c9398d950 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Wed, 13 Aug 2025 19:12:00 +0800 Subject: [PATCH 032/233] [Platform] Custom ops support for FusedMoe (#22509) Signed-off-by: wangxiyuan --- vllm/model_executor/layers/fused_moe/layer.py | 3 ++- vllm/model_executor/layers/linear.py | 12 ++++++------ .../layers/vocab_parallel_embedding.py | 4 +++- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 8ef0a805d86c..ddc02168e5c4 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -682,7 +682,8 @@ def determine_expert_map( return (local_num_experts, expert_map) -class FusedMoE(torch.nn.Module): +@CustomOp.register("fused_moe") +class FusedMoE(CustomOp): """FusedMoE layer for MoE models. This layer contains both MergedColumnParallel weights (gate_up_proj / diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index bb81a663d454..75391c51f775 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -16,6 +16,7 @@ tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) from vllm.logger import init_logger +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.utils import dispatch_unquantized_gemm @@ -226,7 +227,7 @@ def apply(self, return dispatch_unquantized_gemm()(layer, x, layer.weight, bias) -class LinearBase(torch.nn.Module): +class LinearBase(CustomOp): """Base linear layer. Args: @@ -269,12 +270,8 @@ def __init__( prefix=prefix) self.return_bias = return_bias - def forward( - self, x: torch.Tensor - ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: - raise NotImplementedError - +@CustomOp.register("replicated_linear") class ReplicatedLinear(LinearBase): """Replicated linear layer. @@ -443,6 +440,7 @@ def weight_loader(self, param[shard_offset:shard_offset + shard_size] = loaded_weight +@CustomOp.register("column_parallel_linear") class ColumnParallelLinear(LinearBase): """Linear layer with column parallelism. @@ -1229,6 +1227,7 @@ def weight_loader(self, param_data.copy_(loaded_weight) +@CustomOp.register("row_parallel_linear") class RowParallelLinear(LinearBase): """Linear layer with row parallelism. @@ -1405,6 +1404,7 @@ def extra_repr(self) -> str: return s +@CustomOp.register("qkv_cross_parallel_linear") class QKVCrossParallelLinear(LinearBase): """Linear layers for efficient cross-attention's QKV transformation. 
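The point of re-basing `FusedMoE`, the linear layers, and `VocabParallelEmbedding` on `CustomOp` and registering them by name is that each layer gains a string key under which platform-specific or out-of-tree implementations can be looked up and substituted. The toy sketch below shows only the general register-by-name pattern; it is not vLLM's actual `CustomOp` API, and all class and method names here are invented for illustration.

```python
# Toy illustration of the "named custom op" pattern these registrations enable.
# NOT vLLM's real CustomOp API; names and dispatch logic are assumptions.
from typing import Callable, Dict, Type


class ToyCustomOp:
    _registry: Dict[str, Type["ToyCustomOp"]] = {}

    @classmethod
    def register(cls, name: str) -> Callable[[type], type]:
        def decorator(op_cls: Type["ToyCustomOp"]) -> Type["ToyCustomOp"]:
            # Platforms/plugins could later look up or replace this entry by name.
            cls._registry[name] = op_cls
            return op_cls
        return decorator

    def forward(self, x):
        # A real implementation would dispatch to a platform-specific override
        # (CUDA, ROCm, out-of-tree plugin, ...) if one is registered for this op.
        return self.forward_native(x)

    def forward_native(self, x):
        raise NotImplementedError


@ToyCustomOp.register("fused_moe")
class ToyFusedMoE(ToyCustomOp):
    def forward_native(self, x):
        return x  # placeholder reference implementation


print(ToyCustomOp._registry)  # {'fused_moe': <class '...ToyFusedMoE'>}
```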
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index a5f262c832bf..9f223998e554 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -12,6 +12,7 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding) from vllm.model_executor.layers.utils import dispatch_unquantized_gemm @@ -159,7 +160,8 @@ def get_masked_input_and_mask( return input_, ~vocab_mask -class VocabParallelEmbedding(torch.nn.Module): +@CustomOp.register("vocab_parallel_embedding") +class VocabParallelEmbedding(CustomOp): """Embedding parallelized in the vocabulary dimension. Adapted from torch.nn.Embedding, note that we pad the vocabulary size to From 3821bba619cb1a74365752685f286b54d7d98863 Mon Sep 17 00:00:00 2001 From: Kdump Date: Wed, 13 Aug 2025 19:14:24 +0800 Subject: [PATCH 033/233] [Frontend] Add chunked processing to handle long inputs in embedding models (#22280) Signed-off-by: x22x22 Signed-off-by: Kdump Signed-off-by: DarkLight1337 Co-authored-by: Cyrus Leung Co-authored-by: Maximilien de Bayser Co-authored-by: DarkLight1337 --- .../openai_embedding_long_text/README.md | 186 +++++++ .../openai_embedding_long_text/client.py | 366 ++++++++++++++ .../openai_embedding_long_text/service.sh | 137 ++++++ .../openai/test_embedding_long_text.py | 441 +++++++++++++++++ vllm/config/__init__.py | 19 + vllm/entrypoints/openai/serving_embedding.py | 457 +++++++++++++++++- 6 files changed, 1603 insertions(+), 3 deletions(-) create mode 100644 examples/online_serving/openai_embedding_long_text/README.md create mode 100644 examples/online_serving/openai_embedding_long_text/client.py create mode 100644 examples/online_serving/openai_embedding_long_text/service.sh create mode 100644 tests/entrypoints/openai/test_embedding_long_text.py diff --git a/examples/online_serving/openai_embedding_long_text/README.md b/examples/online_serving/openai_embedding_long_text/README.md new file mode 100644 index 000000000000..04edc4680ea0 --- /dev/null +++ b/examples/online_serving/openai_embedding_long_text/README.md @@ -0,0 +1,186 @@ +# Long Text Embedding with Chunked Processing + +This directory contains examples for using vLLM's **chunked processing** feature to handle long text embedding that exceeds the model's maximum context length. 
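At a high level (see the "How It Works" section below), the server splits an over-long token sequence into chunks of at most `max_position_embeddings` tokens, embeds each chunk with the model's native pooling, and combines the per-chunk vectors with a token-count-weighted mean. The sketch below illustrates only that splitting and aggregation math; it is a self-contained approximation, and the helper names, the 512-token chunk size, and the 384-dimensional vectors are illustrative assumptions rather than vLLM internals.

```python
# Minimal sketch of chunk splitting + weighted-mean aggregation (illustrative only).
import numpy as np


def split_into_chunks(token_ids: list[int], max_chunk_size: int) -> list[list[int]]:
    """Split a token sequence into consecutive chunks of at most max_chunk_size."""
    return [token_ids[i:i + max_chunk_size]
            for i in range(0, len(token_ids), max_chunk_size)]


def aggregate_chunk_embeddings(chunk_embeddings: list[np.ndarray],
                               chunk_token_counts: list[int],
                               normalize: bool = True) -> np.ndarray:
    """Token-count-weighted mean of per-chunk embeddings, optionally L2-normalized."""
    weights = np.asarray(chunk_token_counts, dtype=np.float64)
    stacked = np.stack(chunk_embeddings)          # (num_chunks, hidden_dim)
    pooled = (stacked * weights[:, None]).sum(axis=0) / weights.sum()
    if normalize:
        pooled = pooled / np.linalg.norm(pooled)
    return pooled


# Example: a 1200-token input with a 512-token limit -> chunks of 512/512/176 tokens.
chunks = split_into_chunks(list(range(1200)), max_chunk_size=512)
fake_chunk_embeddings = [np.random.rand(384) for _ in chunks]  # stand-in for model output
embedding = aggregate_chunk_embeddings(fake_chunk_embeddings, [len(c) for c in chunks])
print(embedding.shape)  # (384,)
```

Weighting each chunk by its token count keeps a short trailing chunk from contributing as much as a full-length one, so the final vector reflects the whole document proportionally.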
+ +## 🚀 Quick Start + +### Start the Server + +Use the provided script to start a vLLM server with chunked processing enabled: + +```bash +# Basic usage (supports very long texts up to ~3M tokens) +./service.sh + +# Custom configuration with different models +MODEL_NAME="jinaai/jina-embeddings-v3" \ +MAX_EMBED_LEN=1048576 \ +./service.sh + +# For extremely long documents +MODEL_NAME="intfloat/multilingual-e5-large" \ +MAX_EMBED_LEN=3072000 \ +./service.sh +``` + +### Test Long Text Embedding + +Run the comprehensive test client: + +```bash +python client.py +``` + +## 📁 Files + +| File | Description | +|------|-------------| +| `service.sh` | Server startup script with chunked processing enabled | +| `client.py` | Comprehensive test client for long text embedding | + +## ⚙️ Configuration + +### Server Configuration + +The key parameters for chunked processing are in the `--override-pooler-config`: + +```json +{ + "pooling_type": "auto", + "normalize": true, + "enable_chunked_processing": true, + "max_embed_len": 3072000 +} +``` + +!!! note + `pooling_type` sets the model's own pooling strategy for processing within each chunk. The cross-chunk aggregation automatically uses MEAN strategy when input exceeds the model's native maximum length. + +#### Chunked Processing Behavior + +Chunked processing uses **MEAN aggregation** for cross-chunk combination when input exceeds the model's native maximum length: + +| Component | Behavior | Description | +|-----------|----------|-------------| +| **Within chunks** | Model's native pooling | Uses the model's configured pooling strategy | +| **Cross-chunk aggregation** | Always MEAN | Weighted averaging based on chunk token counts | +| **Performance** | Optimal | All chunks processed for complete semantic coverage | + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `MODEL_NAME` | `intfloat/multilingual-e5-large` | Embedding model to use (supports multiple models) | +| `PORT` | `31090` | Server port | +| `GPU_COUNT` | `1` | Number of GPUs to use | +| `MAX_EMBED_LEN` | `3072000` | Maximum embedding input length (supports very long documents) | +| `POOLING_TYPE` | `auto` | Model's native pooling type: `auto`, `MEAN`, `CLS`, `LAST` (only affects within-chunk pooling, not cross-chunk aggregation) | +| `API_KEY` | `EMPTY` | API key for authentication | + +## 🔧 How It Works + +1. **Enhanced Input Validation**: `max_embed_len` allows accepting inputs longer than `max_model_len` without environment variables +2. **Smart Chunking**: Text is split based on `max_position_embeddings` to maintain semantic integrity +3. **Unified Processing**: All chunks processed separately through the model using its configured pooling strategy +4. **MEAN Aggregation**: When input exceeds model's native length, results combined using token count-based weighted averaging across all chunks +5. 
**Consistent Output**: Final embeddings maintain the same dimensionality as standard processing + +### Input Length Handling + +- **Within max_embed_len**: Input is accepted and processed (up to 3M+ tokens) +- **Exceeds max_position_embeddings**: Chunked processing is automatically triggered +- **Exceeds max_embed_len**: Input is rejected with clear error message +- **No environment variables required**: Works without `VLLM_ALLOW_LONG_MAX_MODEL_LEN` + +### Extreme Long Text Support + +With `MAX_EMBED_LEN=3072000`, you can process: + +- **Academic papers**: Full research papers with references +- **Legal documents**: Complete contracts and legal texts +- **Books**: Entire chapters or small books +- **Code repositories**: Large codebases and documentation + +## 📊 Performance Characteristics + +### Chunked Processing Performance + +| Aspect | Behavior | Performance | +|--------|----------|-------------| +| **Chunk Processing** | All chunks processed with native pooling | Consistent with input length | +| **Cross-chunk Aggregation** | MEAN weighted averaging | Minimal overhead | +| **Memory Usage** | Proportional to number of chunks | Moderate, scalable | +| **Semantic Quality** | Complete text coverage | Optimal for long documents | + +## 🧪 Test Cases + +The test client demonstrates: + +- ✅ **Short text**: Normal processing (baseline) +- ✅ **Medium text**: Single chunk processing +- ✅ **Long text**: Multi-chunk processing with aggregation +- ✅ **Very long text**: Many chunks processing +- ✅ **Extreme long text**: Document-level processing (100K+ tokens) +- ✅ **Batch processing**: Mixed-length inputs in one request +- ✅ **Consistency**: Reproducible results across runs + +## 🐛 Troubleshooting + +### Common Issues + +1. **Chunked processing not enabled**: + + ```log + ValueError: This model's maximum position embeddings length is 4096 tokens... + ``` + + **Solution**: Ensure `enable_chunked_processing: true` in pooler config + +2. **Input exceeds max_embed_len**: + + ```log + ValueError: This model's maximum embedding input length is 3072000 tokens... + ``` + + **Solution**: Increase `max_embed_len` in pooler config or reduce input length + +3. **Memory errors**: + + ```log + RuntimeError: CUDA out of memory + ``` + + **Solution**: Reduce chunk size by adjusting model's `max_position_embeddings` or use fewer GPUs + +4. **Slow processing**: + **Expected**: Long text takes more time due to multiple inference calls + +### Debug Information + +Server logs show chunked processing activity: + +```log +INFO: Input length 150000 exceeds max_position_embeddings 4096, will use chunked processing +INFO: Split input of 150000 tokens into 37 chunks (max_chunk_size: 4096) +``` + +## 🤝 Contributing + +To extend chunked processing support to other embedding models: + +1. Check model compatibility with the pooling architecture +2. Test with various text lengths +3. Validate embedding quality compared to single-chunk processing +4. 
Submit PR with test cases and documentation updates + +## 🆕 Enhanced Features + +### max_embed_len Parameter + +The new `max_embed_len` parameter provides: + +- **Simplified Configuration**: No need for `VLLM_ALLOW_LONG_MAX_MODEL_LEN` environment variable +- **Flexible Input Validation**: Accept inputs longer than `max_model_len` up to `max_embed_len` +- **Extreme Length Support**: Process documents with millions of tokens +- **Clear Error Messages**: Better feedback when inputs exceed limits +- **Backward Compatibility**: Existing configurations continue to work diff --git a/examples/online_serving/openai_embedding_long_text/client.py b/examples/online_serving/openai_embedding_long_text/client.py new file mode 100644 index 000000000000..6e9838ac6d8d --- /dev/null +++ b/examples/online_serving/openai_embedding_long_text/client.py @@ -0,0 +1,366 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Example script demonstrating long text embedding with chunked processing in vLLM. + +This example shows how to use vLLM's chunked processing feature to handle text +inputs that exceed the model's maximum token length. The feature automatically +splits long text into chunks and handles different pooling types optimally. + +Prerequisites: +1. Start vLLM server with chunked processing enabled: + + # MEAN pooling (processes all chunks, recommended for complete coverage) + vllm serve intfloat/multilingual-e5-large \ + --override-pooler-config \ + '{"pooling_type": "MEAN", "normalize": true, ' \ + '"enable_chunked_processing": true, "max_embed_len": 3072000}' \ + --served-model-name multilingual-e5-large \ + --trust-remote-code \ + --port 31090 \ + --api-key your-api-key + + # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks) + vllm serve BAAI/bge-large-en-v1.5 \ + --override-pooler-config \ + '{"pooling_type": "CLS", "normalize": true, ' \ + '"enable_chunked_processing": true, "max_embed_len": 1048576}' \ + --served-model-name bge-large-en-v1.5 \ + --trust-remote-code \ + --port 31090 \ + --api-key your-api-key + +2. Install required dependencies: + pip install openai requests +""" + +import time + +import numpy as np +from openai import OpenAI + +# Configuration +API_KEY = "your-api-key" # Replace with your actual API key +BASE_URL = "http://localhost:31090/v1" +MODEL_NAME = "multilingual-e5-large" + + +def generate_long_text(base_text: str, repeat_count: int) -> str: + """Generate long text by repeating base text.""" + return base_text * repeat_count + + +def test_embedding_with_different_lengths(): + """Test embedding generation with different text lengths.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + # Test cases with different text lengths + test_cases = [ + { + "name": "Short Text", + "text": "Hello, this is a short text for embedding.", + "expected_chunks": 1, + }, + { + "name": "Medium Text", + "text": generate_long_text( + "This is a medium-length text that should fit within the " + "model's context window. " * 20, + 2, + ), + "expected_chunks": 1, + }, + { + "name": "Long Text (2 chunks)", + "text": generate_long_text( + "This is a very long text that will exceed the model's " + "maximum context length and trigger chunked processing. " * 50, + 5, + ), + "expected_chunks": 2, + }, + { + "name": "Very Long Text (3+ chunks)", + "text": generate_long_text( + "This text is extremely long and will definitely " + "require multiple chunks for processing. 
" * 100, + 10, + ), + "expected_chunks": 3, + }, + ] + + print("🧪 Testing vLLM Long Text Embedding with Chunked Processing") + print("=" * 70) + + for i, test_case in enumerate(test_cases, 1): + print(f"\n📝 Test {i}: {test_case['name']}") + print(f"Text length: {len(test_case['text'])} characters") + + try: + start_time = time.time() + + response = client.embeddings.create( + input=test_case["text"], model=MODEL_NAME, encoding_format="float" + ) + + end_time = time.time() + processing_time = end_time - start_time + + # Extract embedding data + embedding = response.data[0].embedding + embedding_dim = len(embedding) + + print("✅ Success!") + print(f" - Embedding dimension: {embedding_dim}") + print(f" - Processing time: {processing_time:.2f}s") + print(f" - Expected chunks: ~{test_case['expected_chunks']}") + print(f" - First 5 values: {embedding[:5]}") + + except Exception as e: + print(f"❌ Failed: {str(e)}") + + +def test_batch_embedding(): + """Test batch embedding with mixed-length inputs.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + print("\n🔄 Testing Batch Embedding with Mixed Lengths") + print("=" * 50) + + # Mix of short and long texts + batch_inputs = [ + "Short text 1", + generate_long_text("Medium length text that fits in one chunk. " * 20, 1), + "Another short text", + generate_long_text("Long text requiring chunked processing. " * 100, 5), + ] + + try: + start_time = time.time() + + response = client.embeddings.create( + input=batch_inputs, model=MODEL_NAME, encoding_format="float" + ) + + end_time = time.time() + processing_time = end_time - start_time + + print("✅ Batch processing successful!") + print(f" - Number of inputs: {len(batch_inputs)}") + print(f" - Number of embeddings: {len(response.data)}") + print(f" - Total processing time: {processing_time:.2f}s") + print( + f" - Average time per input: {processing_time / len(batch_inputs):.2f}s" + ) + + for i, data in enumerate(response.data): + input_length = len(batch_inputs[i]) + embedding_dim = len(data.embedding) + print( + f" - Input {i + 1}: {input_length} chars → {embedding_dim}D embedding" + ) + + except Exception as e: + print(f"❌ Batch processing failed: {str(e)}") + + +def test_multiple_long_texts_batch(): + """Test batch processing with multiple long texts to verify chunk ID uniqueness.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + print("\n🔧 Testing Multiple Long Texts in Batch (Chunk ID Fix Verification)") + print("=" * 70) + + # Create multiple distinct long texts that will all require chunking + # Note: All pooling types now use MEAN aggregation across chunks: + # - Native pooling (MEAN/CLS/LAST) is used within each chunk + # - MEAN aggregation combines results across all chunks + # - Full semantic coverage for all pooling types + long_texts = [ + generate_long_text( + "First long document about artificial intelligence and machine learning. " + * 80, + 6, + ), + generate_long_text( + "Second long document about natural language processing and transformers. " + * 80, + 6, + ), + generate_long_text( + "Third long document about computer vision and neural networks. 
" * 80, 6 + ), + ] + + # Add some short texts to mix things up + batch_inputs = [ + "Short text before long texts", + long_texts[0], + "Short text between long texts", + long_texts[1], + long_texts[2], + "Short text after long texts", + ] + + print("📊 Batch composition:") + for i, text in enumerate(batch_inputs): + length = len(text) + text_type = "Long (will be chunked)" if length > 5000 else "Short" + print(f" - Input {i + 1}: {length} chars ({text_type})") + + try: + start_time = time.time() + + response = client.embeddings.create( + input=batch_inputs, model=MODEL_NAME, encoding_format="float" + ) + + end_time = time.time() + processing_time = end_time - start_time + + print("\n✅ Multiple long texts batch processing successful!") + print(f" - Number of inputs: {len(batch_inputs)}") + print(f" - Number of embeddings returned: {len(response.data)}") + print(f" - Total processing time: {processing_time:.2f}s") + + # Verify each embedding is different (no incorrect aggregation) + embeddings = [data.embedding for data in response.data] + + if len(embeddings) >= 3: + import numpy as np + + # Compare embeddings of the long texts (indices 1, 3, 4) + long_embeddings = [ + np.array(embeddings[1]), # First long text + np.array(embeddings[3]), # Second long text + np.array(embeddings[4]), # Third long text + ] + + print("\n🔍 Verifying embedding uniqueness:") + for i in range(len(long_embeddings)): + for j in range(i + 1, len(long_embeddings)): + cosine_sim = np.dot(long_embeddings[i], long_embeddings[j]) / ( + np.linalg.norm(long_embeddings[i]) + * np.linalg.norm(long_embeddings[j]) + ) + print( + f" - Similarity between long text {i + 1} and {j + 1}: " + f"{cosine_sim:.4f}" + ) + + if ( + cosine_sim < 0.9 + ): # Different content should have lower similarity + print(" ✅ Good: Embeddings are appropriately different") + else: + print( + " ⚠️ High similarity - may indicate chunk " + "aggregation issue" + ) + + print("\n📋 Per-input results:") + for i, data in enumerate(response.data): + input_length = len(batch_inputs[i]) + embedding_dim = len(data.embedding) + embedding_norm = np.linalg.norm(data.embedding) + print( + f" - Input {i + 1}: {input_length} chars → {embedding_dim}D " + f"embedding (norm: {embedding_norm:.4f})" + ) + + print( + "\n✅ This test verifies the fix for chunk ID collisions in " + "batch processing" + ) + print(" - Before fix: Multiple long texts would have conflicting chunk IDs") + print(" - After fix: Each prompt's chunks have unique IDs with prompt index") + + except Exception as e: + print(f"❌ Multiple long texts batch test failed: {str(e)}") + print(" This might indicate the chunk ID collision bug is present!") + + +def test_embedding_consistency(): + """Test that chunked processing produces consistent results.""" + client = OpenAI(api_key=API_KEY, base_url=BASE_URL) + + print("\n🔍 Testing Embedding Consistency") + print("=" * 40) + + # Use the same long text multiple times + long_text = generate_long_text( + "Consistency test text for chunked processing validation. 
" * 50, 3 + ) + + embeddings = [] + + try: + for i in range(3): + response = client.embeddings.create( + input=long_text, model=MODEL_NAME, encoding_format="float" + ) + embeddings.append(response.data[0].embedding) + print(f" - Generated embedding {i + 1}") + + # Check consistency (embeddings should be identical) + if len(embeddings) >= 2: + # Calculate similarity between first two embeddings + + emb1 = np.array(embeddings[0]) + emb2 = np.array(embeddings[1]) + + # Cosine similarity + cosine_sim = np.dot(emb1, emb2) / ( + np.linalg.norm(emb1) * np.linalg.norm(emb2) + ) + + print("✅ Consistency test completed!") + print(f" - Cosine similarity between runs: {cosine_sim:.6f}") + print(" - Expected: ~1.0 (identical embeddings)") + + if cosine_sim > 0.999: + print(" - ✅ High consistency achieved!") + else: + print(" - ⚠️ Consistency may vary due to numerical precision") + + except Exception as e: + print(f"❌ Consistency test failed: {str(e)}") + + +def main(): + """Main function to run all tests.""" + print("🚀 vLLM Long Text Embedding Client") + print(f"📡 Connecting to: {BASE_URL}") + print(f"🤖 Model: {MODEL_NAME}") + masked_key = "*" * (len(API_KEY) - 4) + API_KEY[-4:] if len(API_KEY) > 4 else "****" + print(f"🔑 API Key: {masked_key}") + + # Run all test cases + test_embedding_with_different_lengths() + test_batch_embedding() + test_multiple_long_texts_batch() + test_embedding_consistency() + + print("\n" + "=" * 70) + print("🎉 All tests completed!") + print("\n💡 Key Features Demonstrated:") + print(" - ✅ Automatic chunked processing for long text") + print(" - ✅ Seamless handling of mixed-length batches") + print(" - ✅ Multiple long texts in single batch (chunk ID fix)") + print(" - ✅ Unified chunked processing:") + print(" • Native pooling used within each chunk") + print(" • MEAN aggregation across all chunks") + print(" • Complete semantic coverage for all pooling types") + print(" - ✅ Consistent embedding generation") + print(" - ✅ Backward compatibility with short text") + print("\n📚 For more information, see:") + print( + " - Documentation: https://docs.vllm.ai/en/latest/models/pooling_models.html" + ) + print(" - Chunked Processing Guide: openai_embedding_long_text.md") + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/online_serving/openai_embedding_long_text/service.sh new file mode 100644 index 000000000000..f356d7d4529e --- /dev/null +++ b/examples/online_serving/openai_embedding_long_text/service.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# vLLM Embedding Server with Enhanced Chunked Processing +# This script starts a vLLM server with chunked processing enabled for long text embedding. +# Now supports proper pooling type validation and model-specific configurations. 
+ +set -euo pipefail + +# Configuration +MODEL_NAME=${MODEL_NAME:-"intfloat/multilingual-e5-large"} +MODEL_CODE=${MODEL_CODE:-"multilingual-e5-large"} + +PORT=${PORT:-31090} +GPU_COUNT=${GPU_COUNT:-1} +MAX_EMBED_LEN=${MAX_EMBED_LEN:-3072000} +API_KEY=${API_KEY:-"your-api-key"} + +# Enhanced pooling configuration with model-specific defaults +POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST +export VLLM_ENABLE_CHUNKED_PROCESSING=true +export CUDA_VISIBLE_DEVICES=2,3,4,5 +# export VLLM_ATTENTION_BACKEND=XFORMERS + +echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing" +echo "==================================================================" + +# Environment variables for optimization +export VLLM_WORKER_MULTIPROC_METHOD=spawn + +# Function to determine optimal pooling type for known models +get_optimal_pooling_type() { + local model="$1" + case "$model" in + *"e5-"* | *"multilingual-e5"*) + echo "MEAN" # E5 series native pooling + ;; + *"bge-"*) + echo "CLS" # BGE series native pooling + ;; + *"gte-"*) + echo "LAST" # GTE series native pooling + ;; + *"sentence-t5"* | *"st5"*) + echo "MEAN" # Sentence-T5 native pooling + ;; + *"jina-embeddings"*) + echo "MEAN" # Jina embeddings native pooling + ;; + *"Qwen"*"Embedding"*) + echo "LAST" # Qwen embeddings native pooling + ;; + *) + echo "MEAN" # Default native pooling for unknown models + ;; + esac +} + +# Auto-detect pooling type if not explicitly set +if [ "$POOLING_TYPE" = "auto" ]; then + POOLING_TYPE=$(get_optimal_pooling_type "$MODEL_NAME") + echo "🔍 Auto-detected pooling type: $POOLING_TYPE for model $MODEL_NAME" +fi + +# Display configuration +echo "📋 Configuration:" +echo " - Model: $MODEL_NAME" +echo " - Port: $PORT" +echo " - GPU Count: $GPU_COUNT" +echo " - Enhanced Chunked Processing: ${VLLM_ENABLE_CHUNKED_PROCESSING}" +echo " - Max Embed Length: ${MAX_EMBED_LEN} tokens" +echo " - Native Pooling Type: $POOLING_TYPE + Normalization" +echo " - Cross-chunk Aggregation: MEAN (automatic)" +echo "" + +# Validate GPU availability +if command -v nvidia-smi &> /dev/null; then + gpu_count=$(nvidia-smi --list-gpus | wc -l) + echo "🖥️ Available GPUs: $gpu_count" + if [ "$GPU_COUNT" -gt "$gpu_count" ]; then + echo "⚠️ Warning: Requested $GPU_COUNT GPUs but only $gpu_count available" + echo " Adjusting to use $gpu_count GPUs" + GPU_COUNT=$gpu_count + fi +else + echo "⚠️ Warning: nvidia-smi not found. GPU detection skipped." +fi + +# Chunked processing uses unified MEAN aggregation +echo "ℹ️ Chunked Processing: Using $POOLING_TYPE pooling within chunks, MEAN aggregation across chunks" +echo " - All chunks processed for complete semantic coverage" +echo " - Weighted averaging based on chunk token counts" + +echo "" +echo "🔧 Starting server with enhanced chunked processing configuration..." + +# Build pooler config JSON +POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}" + +# Start vLLM server with enhanced chunked processing +vllm serve "$MODEL_NAME" \ + --tensor-parallel-size "$GPU_COUNT" \ + --enforce-eager \ + --override-pooler-config "$POOLER_CONFIG" \ + --served-model-name ${MODEL_CODE} \ + --api-key "$API_KEY" \ + --trust-remote-code \ + --port "$PORT" \ + --host 0.0.0.0 + +echo "" +echo "✅ vLLM Embedding Server started successfully!" 
+echo "" +echo "📡 Server Information:" +echo " - Base URL: http://localhost:$PORT" +echo " - Model Code: ${MODEL_CODE}" +echo " - API Key: $API_KEY" +echo " - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN" +echo "" +echo "🧪 Test the server with:" +echo " python examples/online_serving/openai_embedding_long_text_client.py" +echo "" +echo "📚 Enhanced features enabled:" +echo " ✅ Intelligent native pooling type detection" +echo " ✅ Unified MEAN aggregation for chunked processing" +echo " ✅ Model-specific native pooling optimization" +echo " ✅ Enhanced max embedding length (${MAX_EMBED_LEN} tokens)" +echo " ✅ Complete semantic coverage for all pooling types" +echo " ✅ OpenAI-compatible API" +echo " ✅ GPU acceleration" +echo "" +echo "🔧 Advanced usage:" +echo " - Set POOLING_TYPE=MEAN|CLS|LAST to override auto-detection" +echo " - Set MAX_EMBED_LEN to adjust maximum input length" +echo " - All pooling types use MEAN aggregation across chunks" diff --git a/tests/entrypoints/openai/test_embedding_long_text.py b/tests/entrypoints/openai/test_embedding_long_text.py new file mode 100644 index 000000000000..86bd34abb97e --- /dev/null +++ b/tests/entrypoints/openai/test_embedding_long_text.py @@ -0,0 +1,441 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test cases for long text embedding with automatic chunking mechanism. + +This test suite validates vLLM's automatic chunking functionality for handling +text inputs that exceed the model's maximum token length, specifically targeting +the intfloat/multilingual-e5-small model (max token length: 512). +""" + +import random + +import openai +import pytest +import pytest_asyncio + +from vllm.entrypoints.openai.protocol import EmbeddingResponse + +from ...utils import RemoteOpenAIServer + + +def _generate_random_text(word_count: int) -> str: + """Generate random text with approximately the specified word count.""" + # Common English words with focus on verbs and nouns for realistic text + common_words = [ + # Essential articles and pronouns (minimal) + "the", + "and", + "you", + "they", + "this", + "that", + "these", + "those", + + # Action verbs + "create", + "build", + "develop", + "design", + "implement", + "execute", + "analyze", + "process", + "generate", + "calculate", + "evaluate", + "optimize", + "transform", + "integrate", + "configure", + "deploy", + "monitor", + "manage", + "discover", + "explore", + "investigate", + "research", + "study", + "examine", + "improve", + "enhance", + "upgrade", + "modify", + "update", + "maintain", + "solve", + "resolve", + "handle", + "address", + "tackle", + "overcome", + "communicate", + "collaborate", + "coordinate", + "organize", + "plan", + "achieve", + "accomplish", + "complete", + "finish", + "deliver", + "provide", + + # Technology and science nouns + "system", + "application", + "software", + "hardware", + "network", + "database", + "algorithm", + "model", + "framework", + "platform", + "interface", + "protocol", + "architecture", + "infrastructure", + "component", + "module", + "service", + "technology", + "innovation", + "solution", + "methodology", + "approach", + "artificial", + "intelligence", + "machine", + "learning", + "neural", + "network", + "computer", + "processor", + "memory", + "storage", + "computation", + "data", + "information", + "knowledge", + "insight", + "pattern", + "trend", + "analysis", + "research", + "development", + "engineering", + "science", + "mathematics", + "statistics", + "probability", + 
"optimization", + "performance", + "efficiency", + + # General nouns + "project", + "team", + "organization", + "company", + "business", + "industry", + "market", + "customer", + "user", + "client", + "product", + "feature", + "function", + "requirement", + "specification", + "documentation", + "report", + "result", + "outcome", + "impact", + "benefit", + "advantage", + "challenge", + "problem", + "opportunity", + "strategy", + "goal", + "objective", + "target", + "milestone", + "process", + "procedure", + "workflow", + "pipeline", + "operation", + "task", + "activity", + "event", + "session", + "meeting", + "discussion", + "decision" + ] + + words = [] + for _ in range(word_count): + words.append(random.choice(common_words)) + + # Add some punctuation for more realistic text + text = " ".join(words) + # Add periods every 10-20 words + words_list = text.split() + result = [] + for i, word in enumerate(words_list): + result.append(word) + if ((i + 1) % random.randint(10, 20) == 0 and i < len(words_list) - 1): + result[-1] += "." + + return " ".join(result) + + +MODEL_NAME = "intfloat/multilingual-e5-small" +DTYPE = "bfloat16" + +# Test text: Generate text with approximately 1500 words to exceed 1024 tokens +LONG_TEXT_1500_WORDS = _generate_random_text(1500) + +# Test text: Generate text with approximately 2500 words to exceed 2048 tokens +LONG_TEXT_2500_WORDS = _generate_random_text(2500) + + +@pytest.fixture(scope="module") +def server_with_chunked_processing(): + """Start server with automatic chunking processing enabled.""" + args = [ + "--runner", + "pooling", + "--dtype", + DTYPE, + "--enforce-eager", + "--max-model-len", + "512", # Set smaller max_model_len to trigger chunking mechanism + '--override-pooler-config', + ('{"pooling_type": "MEAN", "normalize": true, ' + '"enable_chunked_processing": true, "max_embed_len": 10000}'), + "--gpu-memory-utilization", + "0.8", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client_with_chunked_processing(server_with_chunked_processing): + """Create async client with chunking processing support.""" + async with server_with_chunked_processing.get_async_client( + ) as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_long_text_embedding_1500_chars( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test embedding processing for ~1500 character long text + (~1028 tokens, exceeding 512 token limit).""" + + # Verify text length + # Verify text has sufficient word count (approximately 1500 words) + word_count = len(LONG_TEXT_1500_WORDS.split()) + assert word_count >= 1400, ( + f"Test text word count insufficient: {word_count} words") + + # Send embedding request + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=[LONG_TEXT_1500_WORDS], + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding + ) == 384 # multilingual-e5-small embedding dimension + assert embeddings.usage.completion_tokens == 0 + # Due to chunked processing, token count should + # reflect actual processed tokens + # With ~1500 words, we expect roughly + # 1024+ tokens (exceeding 512 token limit) + # Should exceed single chunk limit of 
512 + assert embeddings.usage.prompt_tokens > 800 + assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens + + # Verify embedding vector validity + embedding_vector = embeddings.data[0].embedding + assert all( + isinstance(x, float) + for x in embedding_vector), "Embedding vector should contain floats" + assert not all( + x == 0 + for x in embedding_vector), "Embedding vector should not be all zeros" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_long_text_embedding_2500_chars( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test embedding processing for ~2500 character long text + (~2048 tokens, requiring multiple chunks).""" + + # Verify text length + # Verify text has sufficient word count (approximately 2500 words) + word_count = len(LONG_TEXT_2500_WORDS.split()) + assert word_count >= 2300, ( + f"Test text word count insufficient: {word_count} words") + + # Send embedding request + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=[LONG_TEXT_2500_WORDS], + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 1 + assert len(embeddings.data[0].embedding + ) == 384 # multilingual-e5-small embedding dimension + assert embeddings.usage.completion_tokens == 0 + # Due to chunked processing, token count should + # reflect actual processed tokens + # With ~2500 words, we expect + # roughly 2048+ tokens (requiring multiple chunks) + # Should require multiple chunks for processing + assert embeddings.usage.prompt_tokens > 1500 + assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens + + # Verify embedding vector validity + embedding_vector = embeddings.data[0].embedding + assert all( + isinstance(x, float) + for x in embedding_vector), "Embedding vector should contain floats" + assert not all( + x == 0 + for x in embedding_vector), "Embedding vector should not be all zeros" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_long_text_embedding( + client_with_chunked_processing: openai.AsyncOpenAI, model_name: str): + """Test batch long text embedding processing.""" + + input_texts = [ + LONG_TEXT_1500_WORDS, + LONG_TEXT_2500_WORDS, + "This is a short text test.", # Short text for comparison + ] + + # Send batch embedding request + embedding_response = await client_with_chunked_processing.embeddings.create( + model=model_name, + input=input_texts, + encoding_format="float", + ) + + # Verify response structure + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 3 # Three input texts + + # Verify each embedding dimension + for i, embedding_data in enumerate(embeddings.data): + assert len(embedding_data.embedding) == 384 + assert embedding_data.index == i + + # Verify embedding vector validity + embedding_vector = embedding_data.embedding + assert all(isinstance(x, float) for x in embedding_vector) + assert not all(x == 0 for x in embedding_vector) + + # Verify token usage + assert embeddings.usage.completion_tokens == 0 + # Total token count should be very substantial + assert embeddings.usage.prompt_tokens > 1000 + assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens + + 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chunked_vs_normal_consistency(
+        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
+    """Test consistency between chunked and
+    normal processing (using short text)."""
+
+    # Use a short text within the 512 token limit
+    short_text = ("Artificial intelligence technology is changing our world, "
+                  "bringing unprecedented opportunities and challenges.")
+
+    # Send embedding request
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=[short_text],
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 384
+    assert embeddings.usage.completion_tokens == 0
+    # Short text should not require chunked processing
+    assert embeddings.usage.prompt_tokens < 512
+    assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
+
+    # Verify embedding vector validity
+    embedding_vector = embeddings.data[0].embedding
+    assert all(isinstance(x, float) for x in embedding_vector)
+    assert not all(x == 0 for x in embedding_vector)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chunked_processing_response_format(
+        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
+    """Test response format and structure during chunked processing."""
+
+    # Test with long text to trigger chunking
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=[LONG_TEXT_1500_WORDS],
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert embeddings.data[0].object == "embedding"
+    assert embeddings.data[0].index == 0
+
+    # Verify embedding vector properties
+    embedding_vector = embeddings.data[0].embedding
+    import math
+    vector_norm = math.sqrt(sum(x * x for x in embedding_vector))
+    # Check that the vector is normalized
+    # (default behavior for most embedding models)
+    assert 0.8 < vector_norm < 1.2, (
+        f"Vector norm should be reasonable, actual: {vector_norm}")
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 6649cd89ee34..b4ea15ef5a0f 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -2598,6 +2598,25 @@ class PoolerConfig:
     ``math-shepherd-mistral-7b-prm`` model.
     """
+    enable_chunked_processing: Optional[bool] = None
+    """
+    Whether to enable chunked processing for long inputs that exceed the model's
+    maximum position embeddings. When enabled, long inputs will be split into
+    chunks, processed separately, and then aggregated using weighted averaging.
+    This allows embedding models to handle arbitrarily long text without CUDA
+    errors. Defaults to False.
+    """
+
+    max_embed_len: Optional[int] = None
+    """
+    Maximum input length allowed for embedding generation. When set, allows
+    inputs longer than max_embed_len to be accepted for embedding models.
+    This parameter enables accepting long inputs without requiring
+    VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds
+    max_embed_len, it will be handled according to the original max_model_len
+    validation logic. Defaults to None (i.e. set to max_model_len).
+ """ + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 84ba00873103..9dcad8e391c6 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -2,9 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 -from typing import Final, Literal, Optional, Union, cast +from collections.abc import AsyncGenerator, Mapping +from typing import Any, Final, Literal, Optional, Union, cast import numpy as np +import torch from fastapi import Request from typing_extensions import assert_never, override @@ -12,19 +14,28 @@ from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger +# yapf conflicts with isort for this docstring +# yapf: disable from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, + EmbeddingCompletionRequest, EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData, ErrorResponse, UsageInfo) from vllm.entrypoints.openai.serving_engine import (EmbeddingServeContext, OpenAIServing, - ServeContext) + RequestPrompt, + ServeContext, + TextTokensPrompt) +# yapf: enable from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt from vllm.logger import init_logger from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput, - PoolingRequestOutput) + PoolingOutput, PoolingRequestOutput, RequestOutput) from vllm.pooling_params import PoolingParams +from vllm.utils import chunk_list logger = init_logger(__name__) @@ -46,6 +57,17 @@ def _get_embedding( class EmbeddingMixin(OpenAIServing): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + pooler_config = self.model_config.pooler_config + + # Avoid repeated attribute lookups + self.supports_chunked_processing = bool( + pooler_config and pooler_config.enable_chunked_processing) + self.max_embed_len = (pooler_config.max_embed_len if pooler_config + and pooler_config.max_embed_len else None) + @override async def _preprocess( self, @@ -129,6 +151,435 @@ def _build_response( usage=usage, ) + def _get_max_position_embeddings(self) -> int: + """Get the model's effective maximum sequence length for chunking.""" + return self.model_config.max_model_len + + def _should_use_chunked_processing(self, request) -> bool: + """Check if chunked processing should be used for this request.""" + return isinstance( + request, + (EmbeddingCompletionRequest, + EmbeddingChatRequest)) and self.supports_chunked_processing + + async def _process_chunked_request( + self, + ctx: EmbeddingServeContext, + original_prompt: TextTokensPrompt, + pooling_params, + trace_headers, + prompt_idx: int, + ) -> list[AsyncGenerator[PoolingRequestOutput, None]]: + """Process a single prompt using chunked processing.""" + generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] + token_ids = original_prompt["prompt_token_ids"] + + # Split into chunks using max_position_embeddings + max_pos_embeddings = self._get_max_position_embeddings() + # Process all chunks for MEAN aggregation + for chunk_idx, chunk_tokens in enumerate( + chunk_list(token_ids, max_pos_embeddings)): + # Create a request ID for this chunk + chunk_request_id = 
(f"{ctx.request_id}-prompt-{prompt_idx}-" + f"chunk-{chunk_idx}") + + # Create engine prompt for this chunk + chunk_engine_prompt = EngineTokensPrompt( + prompt_token_ids=chunk_tokens) + + # Create chunk request prompt for logging + chunk_text = "" + chunk_request_prompt = TextTokensPrompt( + prompt=chunk_text, prompt_token_ids=chunk_tokens) + + # Log the chunk + self._log_inputs(chunk_request_id, + chunk_request_prompt, + params=pooling_params, + lora_request=ctx.lora_request) + + # Create generator for this chunk and wrap it to return indices + original_generator = self.engine_client.encode( + chunk_engine_prompt, + pooling_params, + chunk_request_id, + lora_request=ctx.lora_request, + trace_headers=trace_headers, + priority=getattr(ctx.request, "priority", 0), + ) + + generators.append(original_generator) + + return generators + + def _validate_input( + self, + request, + input_ids: list[int], + input_text: str, + ) -> TextTokensPrompt: + """Override to support chunked processing for embedding requests.""" + token_num = len(input_ids) + + # Note: EmbeddingRequest doesn't have max_tokens + if isinstance(request, + (EmbeddingCompletionRequest, EmbeddingChatRequest)): + # Check if chunked processing is enabled for pooling models + enable_chunked = self._should_use_chunked_processing(request) + + # Use max_position_embeddings for chunked processing decisions + max_pos_embeddings = self._get_max_position_embeddings() + + # Determine the effective max length for validation + if self.max_embed_len is not None: + # Use max_embed_len for validation instead of max_model_len + length_type = "maximum embedding input length" + max_length_value = self.max_embed_len + else: + # Fall back to max_model_len validation (original behavior) + length_type = "maximum context length" + max_length_value = self.max_model_len + + validation_error_msg = ( + "This model's {length_type} is {max_length_value} tokens. " + "However, you requested {token_num} tokens in the input for " + "embedding generation. Please reduce the length of the input.") + + chunked_processing_error_msg = ( + "This model's {length_type} is {max_length_value} tokens. " + "However, you requested {token_num} tokens in the input for " + "embedding generation. 
Please reduce the length of the input " + "or enable chunked processing.") + + # Check if input exceeds max length + if token_num > max_length_value: + raise ValueError( + validation_error_msg.format( + length_type=length_type, + max_length_value=max_length_value, + token_num=token_num)) + + # Check for chunked processing + # when exceeding max_position_embeddings + if token_num > max_pos_embeddings: + if enable_chunked: + # Allow long inputs when chunked processing is enabled + logger.info( + "Input length %s exceeds max_position_embeddings " + "%s, will use chunked processing", token_num, + max_pos_embeddings) + else: + raise ValueError( + chunked_processing_error_msg.format( + length_type="maximum position embeddings length", + max_length_value=max_pos_embeddings, + token_num=token_num)) + + return TextTokensPrompt(prompt=input_text, + prompt_token_ids=input_ids) + + # For other request types, use the parent's implementation + return super()._validate_input(request, input_ids, input_text) + + def _is_text_tokens_prompt(self, prompt) -> bool: + """Check if a prompt is a TextTokensPrompt (has prompt_token_ids).""" + return (isinstance(prompt, dict) and "prompt_token_ids" in prompt + and "prompt_embeds" not in prompt) + + async def _create_single_prompt_generator( + self, + ctx: EmbeddingServeContext, + engine_prompt: Union[EngineTokensPrompt, EngineEmbedsPrompt], + request_prompt: RequestPrompt, + pooling_params: PoolingParams, + trace_headers: Optional[Mapping[str, str]], + prompt_index: int, + ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: + """Create a generator for a single prompt using standard processing.""" + request_id_item = f"{ctx.request_id}-{prompt_index}" + + self._log_inputs(request_id_item, + request_prompt, + params=pooling_params, + lora_request=ctx.lora_request) + + # Mypy has an existing bug related to inferring the variance + # of TypedDicts with `builtins.enumerate`: + # https://github.com/python/mypy/issues/8586#issuecomment-2867698435 + engine_prompt = cast(Union[EngineTokensPrompt, EngineEmbedsPrompt], + engine_prompt) + + # Return the original generator without wrapping + return self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=ctx.lora_request, + trace_headers=trace_headers, + priority=getattr(ctx.request, "priority", 0), + ) + + @override + async def _prepare_generators( + self, + ctx: ServeContext, + ) -> Optional[ErrorResponse]: + """Override to support chunked processing.""" + ctx = cast(EmbeddingServeContext, ctx) + + # Check if we should use chunked processing + use_chunked = self._should_use_chunked_processing(ctx.request) + + # If no chunked processing needed, delegate to parent class + if not use_chunked: + return await super()._prepare_generators(ctx) + + # Custom logic for chunked processing + generators: list[AsyncGenerator[Union[RequestOutput, + PoolingRequestOutput], + None]] = [] + + try: + trace_headers = (None if ctx.raw_request is None else await + self._get_trace_headers(ctx.raw_request.headers)) + + pooling_params = self._create_pooling_params(ctx) + if isinstance(pooling_params, ErrorResponse): + return pooling_params + + # Verify and set the task for pooling params + try: + pooling_params.verify("embed", self.model_config) + except ValueError as e: + return self.create_error_response(str(e)) + + if ctx.engine_prompts is None: + return self.create_error_response( + "Engine prompts not available") + + if ctx.request_prompts is None: + return 
self.create_error_response( + "Request prompts not available") + + max_pos_embeddings = self._get_max_position_embeddings() + + for i, engine_prompt in enumerate(ctx.engine_prompts): + request_prompt = ctx.request_prompts[i] + + # Check if this specific prompt needs chunked processing + if self._is_text_tokens_prompt(request_prompt): + # Cast to TextTokensPrompt since we've verified + # prompt_token_ids + text_tokens_prompt = cast(TextTokensPrompt, request_prompt) + if (len(text_tokens_prompt["prompt_token_ids"]) + > max_pos_embeddings): + # Use chunked processing for this prompt + chunk_generators = await self._process_chunked_request( + ctx, text_tokens_prompt, pooling_params, + trace_headers, i) + generators.extend(chunk_generators) + continue + + # Normal processing for short prompts or non-token prompts + # Cast engine_prompt to the expected type for mypy + engine_prompt_typed = cast( + Union[EngineTokensPrompt, EngineEmbedsPrompt], + engine_prompt) + generator = await self._create_single_prompt_generator( + ctx, engine_prompt_typed, request_prompt, pooling_params, + trace_headers, i) + generators.append(generator) + + from vllm.utils import merge_async_iterators + ctx.result_generator = merge_async_iterators(*generators) + + return None + + except Exception as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + @override + async def _collect_batch( + self, + ctx: ServeContext, + ) -> Optional[ErrorResponse]: + """Collect and aggregate batch results + with support for chunked processing. + + For chunked requests, performs online aggregation to + minimize memory usage. + For regular requests, collects results normally. + """ + ctx = cast(EmbeddingServeContext, ctx) + try: + if ctx.engine_prompts is None: + return self.create_error_response( + "Engine prompts not available") + + # Check if we used chunked processing + use_chunked = self._should_use_chunked_processing(ctx.request) + + if not use_chunked: + return await super()._collect_batch(ctx=ctx) + + if ctx.request_prompts is None: + return self.create_error_response( + "Request prompts not available") + + if ctx.result_generator is None: + return self.create_error_response( + "Result generator not available") + + # Online aggregation for chunked requests to + # minimize memory usage + # Track aggregation state for each prompt + prompt_aggregators: dict[int, dict[str, Any]] = {} + short_prompts_results: dict[int, PoolingRequestOutput] = {} + + async for result_idx, result in ctx.result_generator: + if "-chunk-" in result.request_id: + # Extract prompt_idx from chunked request_id + parts = result.request_id.split("-") + try: + prompt_idx = int(parts[parts.index("prompt") + 1]) + except (ValueError, IndexError): + # Fallback: extract from result_idx if parsing fails + prompt_idx = result_idx + + # Initialize aggregator for this prompt if needed + if prompt_idx not in prompt_aggregators: + prompt_aggregators[prompt_idx] = { + 'weighted_sum': None, + 'total_weight': 0, + 'chunk_count': 0, + 'request_id': result.request_id.split("-chunk-")[0] + } + + aggregator = prompt_aggregators[prompt_idx] + + # MEAN pooling with online weighted averaging + # Ensure result is PoolingRequestOutput + # for embedding processing + if not isinstance(result, PoolingRequestOutput): + return self.create_error_response( + f"Expected PoolingRequestOutput for " + f"chunked embedding, got " + f"{type(result).__name__}") + + # Handle both PoolingOutput and + # EmbeddingOutput types + if hasattr(result.outputs, 'data'): 
+ # PoolingOutput case + embedding_data = result.outputs.data + elif hasattr(result.outputs, 'embedding'): + # EmbeddingOutput case - + # convert embedding list to tensor + embedding_data = result.outputs.embedding + else: + return self.create_error_response( + f"Unsupported output type: " + f"{type(result.outputs).__name__}") + + if not isinstance(embedding_data, torch.Tensor): + embedding_data = torch.tensor(embedding_data, + dtype=torch.float32) + + if result.prompt_token_ids is None: + return self.create_error_response( + "prompt_token_ids cannot be None for " + "chunked processing") + weight = len(result.prompt_token_ids) + + weighted_embedding = embedding_data.to( + dtype=torch.float32) * weight + + if aggregator['weighted_sum'] is None: + # First chunk + aggregator['weighted_sum'] = weighted_embedding + else: + # Accumulate + aggregator['weighted_sum'] += weighted_embedding + + aggregator['total_weight'] += weight + aggregator['chunk_count'] += 1 + else: + # Non-chunked result - extract prompt_idx from request_id + parts = result.request_id.split("-") + try: + # Last part should be prompt index + prompt_idx = int(parts[-1]) + except (ValueError, IndexError): + prompt_idx = result_idx # Fallback to result_idx + + short_prompts_results[prompt_idx] = cast( + PoolingRequestOutput, result) + + # Finalize aggregated results + final_res_batch: list[Union[PoolingRequestOutput, + EmbeddingRequestOutput]] = [] + num_prompts = len(ctx.engine_prompts) + + for prompt_idx in range(num_prompts): + if prompt_idx in prompt_aggregators: + # Finalize MEAN aggregation for this chunked prompt + aggregator = prompt_aggregators[prompt_idx] + + weighted_sum = aggregator['weighted_sum'] + total_weight = aggregator['total_weight'] + + if (weighted_sum is not None + and isinstance(weighted_sum, torch.Tensor) + and isinstance(total_weight, + (int, float)) and total_weight > 0): + + # Compute final mean embedding + final_embedding = weighted_sum / total_weight + + # Create a PoolingRequestOutput + # for the aggregated result + pooling_output_data = PoolingOutput( + data=final_embedding) + + # Get original prompt token IDs for this prompt + original_prompt = ctx.request_prompts[prompt_idx] + if not self._is_text_tokens_prompt(original_prompt): + return self.create_error_response( + f"Chunked prompt {prompt_idx} is not a " + f"TextTokensPrompt") + + original_token_ids = cast( + TextTokensPrompt, + original_prompt)["prompt_token_ids"] + + pooling_request_output = PoolingRequestOutput( + request_id=aggregator['request_id'], + prompt_token_ids=original_token_ids, + outputs=pooling_output_data, + finished=True) + + final_res_batch.append(pooling_request_output) + else: + return self.create_error_response( + f"Failed to aggregate chunks " + f"for prompt {prompt_idx}") + elif prompt_idx in short_prompts_results: + final_res_batch.append( + cast(PoolingRequestOutput, + short_prompts_results[prompt_idx])) + else: + return self.create_error_response( + f"Result not found for prompt {prompt_idx}") + + ctx.final_res_batch = cast( + list[Union[RequestOutput, PoolingRequestOutput]], + final_res_batch) + + return None + + except Exception as e: + return self.create_error_response(str(e)) + class OpenAIServingEmbedding(EmbeddingMixin): request_id_prefix = "embd" From 1ddd5e734110de78a08890087c12f0ecf3ae3e9e Mon Sep 17 00:00:00 2001 From: Chi Zhang Date: Wed, 13 Aug 2025 20:27:25 +0800 Subject: [PATCH 034/233] [FEATURE] support custom vllm tuned config path for fused moe triton kernels (#22791) Signed-off-by: Chi Zhang --- 
vllm/envs.py | 6 ++++ .../layers/fused_moe/fused_moe.py | 28 +++++++++++++------ 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 0b016dbc85d6..2470a891c9d7 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -159,6 +159,7 @@ VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False + VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None def get_default_cache_root(): @@ -1127,6 +1128,11 @@ def get_vllm_port() -> Optional[int]: # never removed from memory until the server terminates. "VLLM_ENABLE_RESPONSES_API_STORE": lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))), + + # Allows vllm to find tuned config under customized folder + "VLLM_TUNED_CONFIG_FOLDER": + lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None), + } # --8<-- [end:env-vars-definition] diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ad094c37f947..98087a35e15c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -701,20 +701,32 @@ def get_moe_configs( block_shape = [block_n, block_k] if block_n and block_k else None json_file_name = get_config_file_name(E, N, dtype, block_shape) - config_file_path = os.path.join( + config_file_paths = [] + + # note that we prioritize user defined config + user_defined_config_folder = envs.VLLM_TUNED_CONFIG_FOLDER + if user_defined_config_folder is not None: + user_defined_config_file_path = os.path.join( + user_defined_config_folder, json_file_name) + config_file_paths.append(user_defined_config_file_path) + + default_config_file_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) - if os.path.exists(config_file_path): - with open(config_file_path) as f: - logger.info("Using configuration from %s for MoE layer.", - config_file_path) - # If a configuration has been found, return it - return {int(key): val for key, val in json.load(f).items()} + config_file_paths.append(default_config_file_path) + + for config_file_path in config_file_paths: + if os.path.exists(config_file_path): + with open(config_file_path) as f: + logger.info("Using configuration from %s for MoE layer.", + config_file_path) + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} # If no optimized configuration is available, we will use the default # configuration logger.warning( ("Using default MoE config. Performance might be sub-optimal! 
" - "Config file not found at %s"), config_file_path) + "Config file not found at %s"), config_file_paths) return None From 00f1ba7a2d65ad12d5e9e154ba7308fe9fd615a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 13 Aug 2025 15:03:53 +0200 Subject: [PATCH 035/233] [Nixl][CI] Fix tests (#22806) Signed-off-by: NickLucche --- tests/v1/kv_connector/unit/test_nixl_connector.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 3860d7c85724..b185936ab025 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -229,6 +229,9 @@ def _nixl_handshake(self, host: str, port: int, remote_tp_size: int, num_blocks=1, block_len=self.block_len, attn_backend_name=self.backend_name, + # `self.kv_cache_layout` is only forced to HND when vllm engine + # is started. We mock HND here. + kv_cache_layout="HND", ), remote_tp_size=remote_tp_size) return {0: remote_agent_name} From 657eac2840f908aae9f37aa50c93743febcec220 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 13 Aug 2025 06:07:09 -0700 Subject: [PATCH 036/233] [Bugfix][mamba] Fix type annotation of Mamba2Metadata (#22787) Signed-off-by: Chen Zhang --- .../layers/mamba/mamba_mixer2.py | 8 ++-- vllm/v1/attention/backends/mamba_attn.py | 39 +++++++++++-------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index d5f4877135c9..10a5618c227e 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -473,12 +473,12 @@ def forward_cuda( conv_state = self_kv_cache[0].transpose(-1, -2) ssm_state = self_kv_cache[1] state_indices_tensor = attn_metadata.state_indices_tensor - has_initial_states_p = attn_metadata.has_initial_states + has_initial_states_p = attn_metadata.has_initial_states_p prep_initial_states = attn_metadata.prep_initial_states chunk_size = attn_metadata.chunk_size - seq_idx_p = attn_metadata.seq_idx - chunk_indices_p = attn_metadata.chunk_indices - chunk_offsets_p = attn_metadata.chunk_offsets + seq_idx_p = attn_metadata.seq_idx_p + chunk_indices_p = attn_metadata.chunk_indices_p + chunk_offsets_p = attn_metadata.chunk_offsets_p else: conv_state = mamba_cache_params.conv_state ssm_state = mamba_cache_params.ssm_state diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index 7c1226049f69..3f84f8967db7 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -68,14 +68,19 @@ class Mamba2AttentionMetadata: query_start_loc: torch.Tensor seq_lens: torch.Tensor - has_initial_states: torch.Tensor prep_initial_states: bool chunk_size: int - seq_idx: torch.Tensor - chunk_indices: torch.Tensor - chunk_offsets: torch.Tensor + + # The following tensors only contain prefill requests and will be None if + # the batch has no prefill request. 
+ has_initial_states_p: Optional[torch.Tensor] + seq_idx_p: Optional[torch.Tensor] + chunk_indices_p: Optional[torch.Tensor] + chunk_offsets_p: Optional[torch.Tensor] state_indices_tensor: torch.Tensor # shape: [batch,] + + # The following attributes are for triton implementation of causal_conv1d nums_dict: Optional[dict] = None cu_seqlen: Optional[int] = None batch_ptr: Optional[torch.tensor] = None @@ -115,11 +120,11 @@ def build(self, query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens - seq_idx = None - chunk_indices, chunk_offsets = None, None + seq_idx_p = None + chunk_indices_p, chunk_offsets_p = None, None # Need flags to indicate if there are initial states # currently we really only support the FlashAttention backend - has_initial_states = None + has_initial_states_p = None prep_initial_states = False state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0] @@ -135,25 +140,25 @@ def build(self, common_attn_metadata. num_computed_tokens_cpu[num_reqs - num_prefills:num_reqs] > 0) prep_initial_states = torch.any(has_initial_states_cpu).item() - has_initial_states = has_initial_states_cpu.to( + has_initial_states_p = has_initial_states_cpu.to( query_start_loc.device) query_start_loc_p = common_attn_metadata.query_start_loc[ -num_prefills - 1:] - num_decode_tokens - seq_idx = torch.repeat_interleave(torch.arange( + seq_idx_p = torch.repeat_interleave(torch.arange( num_prefills, dtype=torch.int32, device=query_start_loc_p.device), - query_start_loc_p.diff(), - output_size=num_prefill_tokens) - seq_idx.unsqueeze_(0) + query_start_loc_p.diff(), + output_size=num_prefill_tokens) + seq_idx_p.unsqueeze_(0) # We compute metadata for chunked prefill once at the top level # model forward and reuse them in mamba layers. If not needed, # they will be ignored inside mamba kernels. if prep_initial_states: - chunk_indices, chunk_offsets = ( + chunk_indices_p, chunk_offsets_p = ( _query_start_loc_to_chunk_indices_offsets( query_start_loc_p, self.chunk_size, num_prefill_tokens)) @@ -173,12 +178,12 @@ def build(self, num_decode_tokens=num_decode_tokens, query_start_loc=query_start_loc, seq_lens=seq_lens, - has_initial_states=has_initial_states, prep_initial_states=prep_initial_states, chunk_size=self.chunk_size, - seq_idx=seq_idx, - chunk_indices=chunk_indices, - chunk_offsets=chunk_offsets, + has_initial_states_p=has_initial_states_p, + seq_idx_p=seq_idx_p, + chunk_indices_p=chunk_indices_p, + chunk_offsets_p=chunk_offsets_p, state_indices_tensor=state_indices_tensor, ) return attn_metadata From dea3291be24201d6b10ec7712b42adcb6bc18f42 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Wed, 13 Aug 2025 21:07:28 +0800 Subject: [PATCH 037/233] Remove unnecessary CUDA sync of qwen image and video preprocess (#22792) Signed-off-by: cyy Signed-off-by: Yuanyuan Chen Co-authored-by: Cyrus Leung --- vllm/model_executor/models/qwen2_5_vl.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 6bea180ffec9..5bcbcc4f0e37 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -976,10 +976,12 @@ def _process_image_input( image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. 
+ # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() - return image_embeds.split(sizes.tolist()) + return image_embeds.split(sizes) def _process_video_input( self, @@ -998,9 +1000,11 @@ def _process_video_input( # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size - sizes = grid_thw.prod(-1) // merge_size // merge_size + # Using prod on grid_thw_list instead of grid_thw.prod avoids CUDA sync + sizes = (torch.tensor(grid_thw_list, dtype=torch.long).prod(-1) // + (merge_size * merge_size)).tolist() - return video_embeds.split(sizes.tolist()) + return video_embeds.split(sizes) def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: mm_input_by_modality = {} From 0b36a38445102f73c12387aadccb96ea50945183 Mon Sep 17 00:00:00 2001 From: Gh0u1L5 Date: Wed, 13 Aug 2025 21:08:23 +0800 Subject: [PATCH 038/233] Fix GGUF loader for Qwen3 MoE. (#22785) Signed-off-by: Gh0u1L5 --- vllm/model_executor/model_loader/gguf_loader.py | 11 +++++++++++ vllm/model_executor/models/qwen3_moe.py | 1 + 2 files changed, 12 insertions(+) diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 26af87c1ed67..21655b0c69bb 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -74,6 +74,17 @@ def _get_gguf_weights_map(self, model_config: ModelConfig): f"model.layers.{idx}.mlp.experts.0.gate_proj.weight" gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = \ f"model.layers.{idx}.mlp.experts.0.up_proj.weight" + if model_type in ("qwen2_moe", "qwen3_moe"): + model_type = model_type.replace("_", "") + # GGUF layer map assumes that we will have a merged expert weights + # so we need to map them manually + for idx in range(config.num_hidden_layers): + gguf_to_hf_name_map[f"blk.{idx}.ffn_down_exps.weight"] = \ + f"model.layers.{idx}.mlp.experts.0.down_proj.weight" + gguf_to_hf_name_map[f"blk.{idx}.ffn_gate_exps.weight"] = \ + f"model.layers.{idx}.mlp.experts.0.gate_proj.weight" + gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = \ + f"model.layers.{idx}.mlp.experts.0.up_proj.weight" arch = None for key, value in gguf.MODEL_ARCH_NAMES.items(): diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 085fc90b47b5..61b16b6a1d2d 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -375,6 +375,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, prefix=f"{prefix}.embed_tokens") self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, From 4112a0974598cb5613ad943d8c12e326da429246 Mon Sep 17 00:00:00 2001 From: milesial Date: Wed, 13 Aug 2025 06:09:26 -0700 Subject: [PATCH 039/233] [Frontend] Multithreaded async multimodal load_bytes (#22710) Signed-off-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com> Co-authored-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com> --- vllm/envs.py | 7 +++++++ vllm/multimodal/utils.py | 26 ++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 
2470a891c9d7..5958a5cc0f29 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -63,6 +63,7 @@ VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_VIDEO_FETCH_TIMEOUT: int = 30 VLLM_AUDIO_FETCH_TIMEOUT: int = 10 + VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8 VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MM_INPUT_CACHE_GIB: int = 4 @@ -556,6 +557,12 @@ def get_vllm_port() -> Optional[int]: "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), + # Max number of workers for the thread pool handling + # media bytes loading. Set to 1 to disable parallel processing. + # Default is 8 + "VLLM_MEDIA_LOADING_THREAD_COUNT": + lambda: int(os.getenv("VLLM_MEDIA_LOADING_THREAD_COUNT", "8")), + # Maximum filesize in MB for a single audio file when processing # speech-to-text requests. Files larger than this will be rejected. # Default is 25 MB diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 8dfbc6503520..b8266fd350f5 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import atexit +from concurrent.futures import ThreadPoolExecutor from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union @@ -33,6 +36,10 @@ MultiModalKwargs = Any MultiModalPlaceholderDict = Any +global_thread_pool = ThreadPoolExecutor( + max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT) +atexit.register(global_thread_pool.shutdown) + class MediaConnector: @@ -139,19 +146,26 @@ async def load_from_url_async( fetch_timeout: Optional[int] = None, ) -> _M: url_spec = urlparse(url) + loop = asyncio.get_running_loop() if url_spec.scheme.startswith("http"): connection = self.connection data = await connection.async_get_bytes(url, timeout=fetch_timeout) - - return media_io.load_bytes(data) + future = loop.run_in_executor(global_thread_pool, + media_io.load_bytes, data) + return await future if url_spec.scheme == "data": - return self._load_data_url(url_spec, media_io) + future = loop.run_in_executor(global_thread_pool, + self._load_data_url, url_spec, + media_io) + return await future if url_spec.scheme == "file": - return self._load_file_url(url_spec, media_io) - + future = loop.run_in_executor(global_thread_pool, + self._load_file_url, url_spec, + media_io) + return await future msg = "The URL must be either a HTTP, data or file URL." 
raise ValueError(msg) @@ -489,4 +503,4 @@ def fetch_video( "video": video_io_kwargs } media_connector = MediaConnector(media_io_kwargs=media_io_kwargs) - return media_connector.fetch_video(video_url) \ No newline at end of file + return media_connector.fetch_video(video_url) From 8dd4c5a5f081693ae8562cbbb664277d9c8eff75 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 13 Aug 2025 22:18:07 +0800 Subject: [PATCH 040/233] [Core] Use individual MM items in P0/P1 cache and model runner (#22570) Signed-off-by: DarkLight1337 --- tests/multimodal/test_utils.py | 233 +++++++------------ tests/v1/core/test_kv_cache_utils.py | 48 ++-- tests/v1/core/test_prefix_caching.py | 31 ++- tests/v1/core/test_scheduler.py | 21 +- tests/v1/core/utils.py | 19 +- tests/v1/engine/test_engine_core.py | 2 +- tests/v1/engine/test_engine_core_client.py | 2 +- tests/v1/engine/test_output_processor.py | 10 +- tests/v1/kv_connector/unit/utils.py | 2 +- tests/v1/tpu/worker/test_tpu_model_runner.py | 2 +- tests/v1/worker/test_gpu_input_batch.py | 2 +- tests/v1/worker/test_gpu_model_runner.py | 2 +- vllm/multimodal/inputs.py | 141 +++++++++-- vllm/multimodal/utils.py | 133 ++++++----- vllm/v1/core/sched/output.py | 10 +- vllm/v1/engine/__init__.py | 6 +- vllm/v1/engine/core.py | 7 +- vllm/v1/engine/mm_input_cache.py | 78 +++---- vllm/v1/engine/processor.py | 66 ++---- vllm/v1/request.py | 21 +- vllm/v1/serial_utils.py | 48 ++-- vllm/v1/worker/gpu_input_batch.py | 13 +- vllm/v1/worker/gpu_model_runner.py | 97 ++++---- vllm/v1/worker/tpu_model_runner.py | 39 ++-- 24 files changed, 548 insertions(+), 485 deletions(-) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 3fdf7e33ca5f..41f4773a11c8 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -5,7 +5,7 @@ import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import TYPE_CHECKING, NamedTuple, Optional +from typing import TYPE_CHECKING, NamedTuple import numpy as np import pytest @@ -19,14 +19,12 @@ initialize_model_parallel) from vllm.multimodal.image import convert_image_mode from vllm.multimodal.inputs import PlaceholderRange -from vllm.multimodal.utils import (MediaConnector, - merge_and_sort_multimodal_metadata, +from vllm.multimodal.utils import (MediaConnector, argsort_mm_positions, run_dp_sharded_vision_model) from vllm.platforms import current_platform from vllm.utils import get_open_port, update_environment_variables if TYPE_CHECKING: - from vllm.multimodal.hasher import MultiModalHashDict from vllm.multimodal.inputs import MultiModalPlaceholderDict # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) @@ -178,19 +176,17 @@ async def test_fetch_video_http(video_url: str, num_frames: int): assert metadata_sync == metadata_async -# Used for the next two tests related to `merge_and_sort_multimodal_metadata`. +# Used for `test_argsort_mm_positions`. 
class TestCase(NamedTuple): mm_positions: "MultiModalPlaceholderDict" - mm_hashes: Optional["MultiModalHashDict"] - expected_modalities: list[str] - expected_ranges: list[PlaceholderRange] - expected_hashes: Optional[list[str]] + expected_modality_idxs: list[tuple[str, int]] -def test_merge_and_sort_multimodal_metadata(): +def test_argsort_mm_positions(): test_cases = [ - # Single modality should return result as is but flattened + # Single modality + ## Internally sorted TestCase( mm_positions={ "image": [ @@ -198,34 +194,27 @@ def test_merge_and_sort_multimodal_metadata(): PlaceholderRange(offset=3, length=2), ] }, - mm_hashes={"image": ["hash1", "hash2"]}, - expected_modalities=["image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=3, length=2), + expected_modality_idxs=[ + ("image", 0), + ("image", 1), ], - expected_hashes=["hash1", "hash2"], ), - - # Single modality without hashes return None for mm hash. + ## Internally unsorted TestCase( mm_positions={ "image": [ + PlaceholderRange(offset=3, length=2), PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=2), ] }, - mm_hashes=None, - expected_modalities=["image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=2), + expected_modality_idxs=[ + ("image", 1), + ("image", 0), ], - expected_hashes=None, ), - # Multiple modalities with hashes should return sorted modalities - # and flattened ranges and hashes. + # Two modalities + ## Internally sorted TestCase( mm_positions={ "image": [ @@ -237,47 +226,54 @@ def test_merge_and_sort_multimodal_metadata(): PlaceholderRange(offset=2, length=3), ] }, - mm_hashes={ - "image": ["image_hash1", "image_hash2"], - "audio": ["audio_hash1", "audio_hash2"], - }, - expected_modalities=["audio", "audio", "image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=3), - PlaceholderRange(offset=7, length=4), - PlaceholderRange(offset=11, length=5), + expected_modality_idxs=[ + ("audio", 0), + ("audio", 1), + ("image", 0), + ("image", 1), ], - expected_hashes=[ - "audio_hash1", "audio_hash2", "image_hash1", "image_hash2" + ), + ## Interleaved, internally sorted + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=0, length=4), + PlaceholderRange(offset=8, length=2), + ], + "audio": [ + PlaceholderRange(offset=5, length=2), + PlaceholderRange(offset=11, length=4), + ] + }, + expected_modality_idxs=[ + ("image", 0), + ("audio", 0), + ("image", 1), + ("audio", 1), ], ), - - # Multiple modalities without hashes should return sorted modalities - # and flattened ranges and None. 
+ ## Interleaved, internally unsorted TestCase( mm_positions={ "image": [ - PlaceholderRange(offset=7, length=4), - PlaceholderRange(offset=11, length=5), + PlaceholderRange(offset=8, length=2), + PlaceholderRange(offset=0, length=4), ], "audio": [ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=3), + PlaceholderRange(offset=11, length=4), + PlaceholderRange(offset=5, length=2), ] }, - mm_hashes=None, - expected_modalities=["audio", "audio", "image", "image"], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=2, length=3), - PlaceholderRange(offset=7, length=4), - PlaceholderRange(offset=11, length=5), + expected_modality_idxs=[ + ("image", 1), + ("audio", 1), + ("image", 0), + ("audio", 0), ], - expected_hashes=None, ), # Three modalities + ## Internally sorted TestCase( mm_positions={ "image": [ @@ -293,72 +289,16 @@ def test_merge_and_sort_multimodal_metadata(): PlaceholderRange(offset=12, length=6), ] }, - mm_hashes={ - "image": ["image_hash1", "image_hash2"], - "audio": ["audio_hash1"], - "video": ["video_hash1", "video_hash2", "video_hash3"] - }, - expected_modalities=[ - "audio", "video", "video", "video", "image", "image" - ], - expected_ranges=[ - PlaceholderRange(offset=0, length=2), - PlaceholderRange(offset=3, length=4), - PlaceholderRange(offset=7, length=5), - PlaceholderRange(offset=12, length=6), - PlaceholderRange(offset=15, length=7), - PlaceholderRange(offset=22, length=8), - ], - expected_hashes=[ - "audio_hash1", "video_hash1", "video_hash2", "video_hash3", - "image_hash1", "image_hash2" - ], - ), - ] - - for (mm_positions, mm_hashes, expected_modalities, expected_ranges, - expected_hashes) in test_cases: - modalities, ranges, hashes = merge_and_sort_multimodal_metadata( - mm_positions, mm_hashes) - - assert modalities == expected_modalities - assert ranges == expected_ranges - assert hashes == expected_hashes - - -def test_merge_and_sort_multimodal_metadata_with_interleaving(): - - test_cases = [ - - #