From 5175b884f70d82d57ad7ce5229f579e45d0c502a Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 22 Apr 2025 16:27:14 -0700 Subject: [PATCH 01/81] [BugFix] Remove default multiproc executor `collective_rpc` timeout (#17000) Signed-off-by: Nick Hill --- vllm/v1/executor/multiproc_executor.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index cff6181fa3ad..cb125bf4bf17 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -38,7 +38,7 @@ POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -EXECUTE_MODEL_TIMEOUT_S = 30 +EXECUTE_MODEL_TIMEOUT_S = 40 class MultiprocExecutor(Executor): @@ -151,16 +151,16 @@ def execute_model( def collective_rpc(self, method: Union[str, Callable], - timeout: Optional[float] = 180.0, + timeout: Optional[float] = None, args: tuple = (), kwargs: Optional[dict] = None, rank0_reply_only: bool = False) -> list[Any]: - start_time = time.monotonic() - kwargs = kwargs or {} - if self.is_failed: raise RuntimeError("Executor failed.") + deadline = None if timeout is None else time.monotonic() + timeout + kwargs = kwargs or {} + # NOTE: If the args are heterogeneous, then we pack them into a list, # and unpack them in the method of every worker, because every worker # knows their own rank. @@ -176,8 +176,8 @@ def collective_rpc(self, workers = (self.workers[0], ) if rank0_reply_only else self.workers responses = [None] * len(workers) for w in workers: - dequeue_timeout = timeout - (time.monotonic() - start_time - ) if timeout is not None else None + dequeue_timeout = None if deadline is None else ( + deadline - time.monotonic()) status, result = w.worker_response_mq.dequeue( timeout=dequeue_timeout, cancel=self.shutdown_event) From 83d933718c82f71e4971b6febe781743a2a52919 Mon Sep 17 00:00:00 2001 From: Chenyaaang <42742451+Chenyaaang@users.noreply.github.com> Date: Tue, 22 Apr 2025 17:05:23 -0700 Subject: [PATCH 02/81] [Core][V1][TPU] Enable structured decoding on TPU V1 (#16499) Signed-off-by: Chenyaaang --- .../scripts/hardware_ci/run-tpu-v1-test.sh | 4 +- .../benchmark_serving_structured_output.py | 2 +- tests/v1/tpu/test_sampler.py | 7 +- vllm/platforms/tpu.py | 4 +- vllm/v1/worker/tpu_model_runner.py | 172 +++++++++++++++--- 5 files changed, 158 insertions(+), 31 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 6b5e86a0ebd6..704bc6b7324d 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -44,7 +44,9 @@ docker run --privileged --net host --shm-size=16G -it \ && echo TEST_9 \ && pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \ && echo TEST_10 \ - && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \ + && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \ + && echo TEST_11 \ + && pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \ # TODO: This test fails because it uses RANDOM_SEED sampling diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index e52f16a8b129..5dd9b1dbd461 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -51,7 +51,7 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser -from 
vllm.v1.structured_output.utils import ( +from vllm.v1.structured_output.backend_xgrammar import ( has_xgrammar_unsupported_json_features) MILLISECONDS_TO_SECONDS_CONVERSION = 1000 diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py index 50d40aa9dec2..046d3e404e4f 100644 --- a/tests/v1/tpu/test_sampler.py +++ b/tests/v1/tpu/test_sampler.py @@ -23,7 +23,7 @@ def test_sampler_different(model_name: str): different results. """ llm = LLM(model_name, - enforce_eager=False, + enforce_eager=True, max_num_seqs=1, max_model_len=512, max_num_batched_tokens=512) @@ -57,4 +57,7 @@ def test_sampler_different(model_name: str): # Make sure first two reqs have the same K/P sampling_params[0] = sampling_params[1] output = llm.generate(p, sampling_params) - assert output[0].outputs[0].text == output[1].outputs[0].text + # There are natural numerical instabilities that make it difficult + # to have deterministic results over many tokens, tests the first ~20 + # tokens match. + assert output[0].outputs[0].text[:20] == output[1].outputs[0].text[:20] diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index b1e221e28b43..fcac5155637f 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -168,9 +168,9 @@ def validate_request( ) -> None: """Raises if this request is unsupported on this platform""" if isinstance(params, SamplingParams): - if params.guided_decoding is not None: + if params.guided_decoding is not None and not envs.VLLM_USE_V1: raise ValueError("Structured output is not supported on " - f"{cls.device_name}.") + f"{cls.device_name} V0.") if params.sampling_type == SamplingType.RANDOM_SEED: raise ValueError( "Torch XLA does not support per-request seed.") diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 7eb464660e95..5d94f675f92e 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -30,8 +30,9 @@ from vllm.v1.attention.backends.pallas import (PallasAttentionBackend, PallasMetadata) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget -from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheSpec, SlidingWindowSpec) +from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec, + KVCacheConfig, KVCacheSpec, + SlidingWindowSpec) from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, ModelRunnerOutput) from vllm.v1.sample.tpu.metadata import TPUSupportedSamplingMetadata @@ -148,6 +149,7 @@ def __init__( self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.head_size = model_config.get_head_size() self.hidden_size = model_config.get_hidden_size() + self.vocab_size = model_config.get_vocab_size() # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY @@ -178,7 +180,7 @@ def __init__( max_num_blocks_per_req=self.max_num_blocks_per_req, device=self.device, pin_memory=self.pin_memory, - vocab_size=model_config.get_vocab_size(), + vocab_size=self.vocab_size, ) # Cached torch/numpy tensor @@ -221,6 +223,20 @@ def __init__( self.num_reqs_paddings = _get_req_paddings( min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs) + # tensors for structured decoding + self.grammar_bitmask_cpu = torch.zeros( + (self.max_num_reqs, cdiv(self.vocab_size, 32)), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + self.require_structured_out_cpu = torch.zeros( + (self.max_num_reqs, 1), + dtype=torch.bool, + device="cpu", + pin_memory=self.pin_memory) + self.structured_decode_arange = 
torch.arange( + 0, 32, device="cpu", pin_memory=self.pin_memory) + # Get maximum number of mm items per modality (batch size). self.max_num_mm_items_by_modality = dict() if (self.is_multimodal_model and self.max_num_encoder_input_tokens > 0 @@ -762,9 +778,16 @@ def execute_model( ) hidden_states = self.select_hidden_states(hidden_states, logits_indices) + logits = self.compute_logits(hidden_states) tpu_sampling_metadata = TPUSupportedSamplingMetadata.\ from_input_batch(self.input_batch, padded_num_reqs, self.device) - selected_token_ids = self.sample_from_hidden(hidden_states, + if scheduler_output.grammar_bitmask is not None: + require_struct_decoding, grammar_bitmask_padded, arange = \ + self.prepare_structured_decoding_input(logits, scheduler_output) + logits = self.structured_decode(require_struct_decoding, + grammar_bitmask_padded, logits, + arange) + selected_token_ids = self.sample_from_logits(logits, tpu_sampling_metadata) # Remove padding on cpu and keep dynamic op outside of xla graph. selected_token_ids = selected_token_ids.cpu()[:num_reqs] @@ -997,7 +1020,7 @@ def _precompile_backbone(self) -> None: self._dummy_run(num_tokens) xm.wait_device_ops() end = time.perf_counter() - logger.info("Compilation finished in in %.2f [secs].", end - start) + logger.info("Compilation finished in %.2f [secs].", end - start) self._update_num_xla_graphs("model backbone") def _precompile_select_hidden_states(self) -> None: @@ -1026,19 +1049,59 @@ def _precompile_select_hidden_states(self) -> None: break xm.wait_device_ops() end = time.perf_counter() - logger.info("Compilation finished in in %.2f [secs].", end - start) + logger.info("Compilation finished in %.2f [secs].", end - start) self._update_num_xla_graphs("select_hidden_states") - def _precompile_sample_from_hidden(self) -> None: - logger.info("Compiling sampling with different num_reqs.") + def _precompile_compute_logits(self) -> None: + logger.info("Compiling compute_logits with different input shapes.") start = time.perf_counter() hsize = self.model_config.get_hidden_size() for num_reqs in self.num_reqs_paddings: dummy_hidden = torch.zeros((num_reqs, hsize), device=self.device, dtype=self._hidden_states_dtype) - # The first dimension of dummy_hidden cannot be mark_dynamic because - # some operations in the sampler require it to be static. + torch._dynamo.mark_dynamic(dummy_hidden, 0) + self.compute_logits(dummy_hidden) + logger.info(" -- num_seqs: %d", num_reqs) + xm.wait_device_ops() + end = time.perf_counter() + logger.info("Compilation finished in %.2f [secs].", end - start) + self._update_num_xla_graphs("compute_logits") + + def _precompile_structured_decoding(self) -> None: + logger.info( + "Compiling structured_decoding with different input shapes.") + start = time.perf_counter() + for num_reqs in self.num_reqs_paddings: + dummy_logits = torch.zeros((num_reqs, self.vocab_size), + device=self.device, + dtype=self._hidden_states_dtype) + dummy_require_struct_decoding = \ + self.require_structured_out_cpu[:num_reqs].to(self.device) + dummy_grammar_bitmask = \ + self.grammar_bitmask_cpu[:num_reqs].to(self.device) + # The first dimension of the above 3 dummy tensors cannot be + # mark_dynamic because some operations in structured_decode require + # them to be static. 
+ arange = self.structured_decode_arange.to(self.device) + self.structured_decode(dummy_require_struct_decoding, + dummy_grammar_bitmask, dummy_logits, arange) + logger.info(" -- num_seqs: %d", num_reqs) + xm.wait_device_ops() + end = time.perf_counter() + logger.info("Compilation finished in %.2f [secs].", end - start) + self._update_num_xla_graphs("structured_decoding") + + def _precompile_sample_from_logits(self) -> None: + logger.info( + "Compiling sample_from_logits with different input shapes.") + start = time.perf_counter() + for num_reqs in self.num_reqs_paddings: + dummy_logits = torch.zeros((num_reqs, self.vocab_size), + device=self.device, + dtype=self._hidden_states_dtype) + # The first dimension of dummy_logits cannot be mark_dynamic + # because some operations in the sampler require it to be static. for all_greedy in [False, True]: generate_params_if_all_greedy = not all_greedy sampling_metadata = ( @@ -1049,12 +1112,12 @@ def _precompile_sample_from_hidden(self) -> None: generate_params_if_all_greedy, )) sampling_metadata.all_greedy = all_greedy - self.sample_from_hidden(dummy_hidden, sampling_metadata) + self.sample_from_logits(dummy_logits, sampling_metadata) logger.info(" -- num_seqs: %d", num_reqs) xm.wait_device_ops() end = time.perf_counter() - logger.info("Compilation finished in in %.2f [secs].", end - start) - self._update_num_xla_graphs("sampling") + logger.info("Compilation finished in %.2f [secs].", end - start) + self._update_num_xla_graphs("sample_from_logits") def capture_model(self) -> None: """ @@ -1063,7 +1126,9 @@ def capture_model(self) -> None: self._precompile_mm_encoder() self._precompile_backbone() self._precompile_select_hidden_states() - self._precompile_sample_from_hidden() + self._precompile_compute_logits() + self._precompile_structured_decoding() + self._precompile_sample_from_logits() def profile_run( self, @@ -1144,7 +1209,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: tensor_config = kv_cache_config.tensors[layer_name] assert tensor_config.size % kv_cache_spec.page_size_bytes == 0 num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes - if isinstance(kv_cache_spec, FullAttentionSpec): + if isinstance(kv_cache_spec, AttentionSpec): kv_cache_shape = PallasAttentionBackend.get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) @@ -1179,16 +1244,14 @@ def select_hidden_states(self, hidden_states, indices_do_sample): return hidden_states[indices_do_sample] @torch.compile(backend="openxla", fullgraph=True, dynamic=False) - def sample_from_hidden( - self, - sample_hidden_states: torch.Tensor, - sampling_metadata: TPUSupportedSamplingMetadata, - ) -> torch.Tensor: - """ - Sample with xla-friendly function. This function is to be traced - separately from `forward` for lighter compilation overhead. 
- """ - logits = self.model.compute_logits(sample_hidden_states, None) + def compute_logits(self, + sample_hidden_states: torch.Tensor) -> torch.Tensor: + return self.model.compute_logits(sample_hidden_states, None) + + @torch.compile(backend="openxla", fullgraph=True, dynamic=False) + def sample_from_logits( + self, logits: torch.Tensor, + sampling_metadata: TPUSupportedSamplingMetadata) -> torch.Tensor: if sampling_metadata.all_greedy: out_tokens = torch.argmax(logits, dim=-1, keepdim=True) else: @@ -1196,12 +1259,71 @@ def sample_from_hidden( sampling_metadata).sampled_token_ids return out_tokens + @torch.compile(backend="openxla", fullgraph=True, dynamic=False) + def structured_decode(self, require_struct_decoding: torch.Tensor, + grammar_bitmask: torch.Tensor, logits: torch.Tensor, + arange: torch.Tensor) -> torch.Tensor: + return torch.where( + require_struct_decoding, + self.apply_grammar_bitmask(logits, grammar_bitmask, arange), + logits) + + def apply_grammar_bitmask(self, logits: torch.Tensor, + grammar_bitmask: torch.Tensor, + arange: torch.Tensor): + assert (logits.shape[0] == grammar_bitmask.shape[0]) + logits_cloned = logits.clone() + for i in range(logits.shape[0]): + unpacked_bitmask = (torch.bitwise_right_shift( + grammar_bitmask[i][:, None], arange[None, :]) & 1) == 0 + unpacked_bitmask = unpacked_bitmask.reshape(-1)[:self.vocab_size] + logits_cloned[i] = logits_cloned[i].masked_fill( + unpacked_bitmask, -float("inf")) + return logits_cloned + def get_multimodal_embeddings(self, *args, **kwargs): return self.model.get_multimodal_embeddings(*args, **kwargs) def get_input_embeddings(self, *args, **kwargs): return self.model.get_input_embeddings(*args, **kwargs) + def prepare_structured_decoding_input( + self, logits: torch.Tensor, scheduler_output: "SchedulerOutput" + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + grammar_bitmask = scheduler_output.grammar_bitmask + assert grammar_bitmask is not None + num_reqs, _ = logits.shape + + # Reset pre-allocated tensors + self.grammar_bitmask_cpu.zero_() + self.require_structured_out_cpu.zero_() + + # We receive the structured output bitmask from the scheduler, but the + # indices of the requests in the batch may not match the indices of + # the bitmask since the scheduler doesn't know how the tpu runner is + # ordering the requests in the batch. We need to match the order of + # bitmask with the order of requests + struct_out_indices: list[int] = [] + mask_indices: list[int] = [] + for req_id in self.input_batch.req_ids: + mask_index = scheduler_output.structured_output_request_ids.get( + req_id) + if mask_index is None: + continue + batch_index = self.input_batch.req_id_to_index[req_id] + struct_out_indices.append(batch_index) + mask_indices.append(mask_index) + self.grammar_bitmask_cpu[struct_out_indices] = torch.from_numpy( + grammar_bitmask[mask_indices]) + # It's not guaranteed that all requests in this batch require + # structured output, so create a bool tensor to represent + # the requests that need structured output. + struct_out_indices = torch.tensor(struct_out_indices, dtype=torch.long) + self.require_structured_out_cpu[struct_out_indices] = True + return self.require_structured_out_cpu[:num_reqs].to(logits.device), \ + self.grammar_bitmask_cpu[:num_reqs].to(logits.device), \ + self.structured_decode_arange.to(logits.device) + def _get_mm_dummy_batch(self, modality: str, batch_size: int) -> BatchedTensorInputs: # Dummy data for pre-compiling multimodal models. 
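
Note on the structured-decoding change above: apply_grammar_bitmask packs the scheduler-provided bitmask as int32 words, where bit j of word w covers token id w * 32 + j and a set bit means the token is allowed; any token whose bit is unset has its logit forced to -inf before sampling. The following is a minimal CPU-only PyTorch sketch of that unpacking, with illustrative names that are not part of the patch:

    import torch

    def apply_packed_bitmask(logits: torch.Tensor,
                             packed_bitmask: torch.Tensor,
                             vocab_size: int) -> torch.Tensor:
        # packed_bitmask: [num_reqs, ceil(vocab_size / 32)] int32; bit j of
        # word w corresponds to token id w * 32 + j, a set bit means "allowed".
        arange = torch.arange(0, 32)
        masked = logits.clone()
        for i in range(logits.shape[0]):
            # Unpack each 32-bit word into 32 booleans; True marks a
            # disallowed token.
            disallowed = (torch.bitwise_right_shift(
                packed_bitmask[i][:, None], arange[None, :]) & 1) == 0
            disallowed = disallowed.reshape(-1)[:vocab_size]
            masked[i] = masked[i].masked_fill(disallowed, -float("inf"))
        return masked

    # Tiny example: vocabulary of 8 tokens, only ids 0 and 3 allowed.
    logits = torch.randn(1, 8)
    packed = torch.tensor([[0b1001]], dtype=torch.int32)
    print(apply_packed_bitmask(logits, packed, vocab_size=8))

In the TPU runner this logic runs under torch.compile with the openxla backend, which is why the patch keeps pre-padded bitmask tensors and avoids mark_dynamic on them, as noted in _precompile_structured_decoding.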
From 36fe78769ff4134c59bb675f2805364938092079 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Wed, 23 Apr 2025 03:43:06 +0200 Subject: [PATCH 03/81] [Bugfix] validate urls object for multimodal content parts (#16990) Signed-off-by: Guillaume Calmettes --- tests/entrypoints/openai/test_audio.py | 29 ++++++++++++++++++++++++ tests/entrypoints/openai/test_video.py | 29 ++++++++++++++++++++++++ tests/entrypoints/openai/test_vision.py | 30 +++++++++++++++++++++++++ vllm/entrypoints/chat_utils.py | 10 +++++---- 4 files changed, 94 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 29d5a85af613..72e616656775 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -104,6 +104,35 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) +async def test_error_on_invalid_audio_url_type(client: openai.AsyncOpenAI, + model_name: str, + audio_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "audio_url", + "audio_url": audio_url + }, + { + "type": "text", + "text": "What's happening in this audio?" + }, + ], + }] + + # audio_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]]) diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 8679c2f25db4..53f057a294c0 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -108,6 +108,35 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_error_on_invalid_video_url_type(client: openai.AsyncOpenAI, + model_name: str, + video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": video_url + }, + { + "type": "text", + "text": "What's in this video?" 
+ }, + ], + }] + + # video_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 87b5cee73ecb..1ab50b41c7ec 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -137,6 +137,36 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_error_on_invalid_image_url_type(client: openai.AsyncOpenAI, + model_name: str, + image_url: str): + content_text = "What's in this image?" + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": image_url + }, + { + "type": "text", + "text": content_text + }, + ], + }] + + # image_url should be a dict {"url": "some url"}, not directly a string + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0) + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 2e19ebcdd61f..bd2c3357cdc0 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -27,10 +27,11 @@ ChatCompletionToolMessageParam) from openai.types.chat.chat_completion_content_part_input_audio_param import ( InputAudio) +from pydantic import TypeAdapter # yapf: enable -# pydantic needs the TypedDict from typing_extensions from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin) +# pydantic needs the TypedDict from typing_extensions from typing_extensions import Required, TypeAlias, TypedDict from vllm.config import ModelConfig @@ -879,12 +880,13 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int], # No need to validate using Pydantic again _TextParser = partial(cast, ChatCompletionContentPartTextParam) -_ImageParser = partial(cast, ChatCompletionContentPartImageParam) _ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam) -_AudioParser = partial(cast, ChatCompletionContentPartAudioParam) _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam) _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) -_VideoParser = partial(cast, ChatCompletionContentPartVideoParam) +# Need to validate url objects +_ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python +_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python +_VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python _ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio] From f67e9e9f221e9791733b827585d6eb6dbc23133c Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 22 Apr 2025 19:08:27 -0700 Subject: [PATCH 04/81] add Dockerfile build vllm against torch nightly (#16936) Signed-off-by: Yang Wang --- .buildkite/test-pipeline.yaml | 3 + 
docker/Dockerfile.nightly_torch | 307 ++++++++++++++++++++++++++++ requirements/nightly_torch_test.txt | 28 +++ 3 files changed, 338 insertions(+) create mode 100644 docker/Dockerfile.nightly_torch create mode 100644 requirements/nightly_torch_test.txt diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 898b8e5da2f4..95e38305e1e1 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -8,6 +8,7 @@ # Documentation # label(str): the name of the test. emoji allowed. # fast_check(bool): whether to run this on each commit on fastcheck pipeline. +# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline. # fast_check_only(bool): run this test on fastcheck pipeline only # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. # command(str): the single command to run for tests. incompatible with commands. @@ -70,6 +71,7 @@ steps: - label: Basic Correctness Test # 30min #mirror_hardwares: [amd] fast_check: true + torch_nightly: true source_file_dependencies: - vllm/ - tests/basic_correctness/test_basic_correctness @@ -104,6 +106,7 @@ steps: - label: Entrypoints Test # 40min working_dir: "/vllm-workspace/tests" fast_check: true + torch_nightly: true #mirror_hardwares: [amd] source_file_dependencies: - vllm/ diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch new file mode 100644 index 000000000000..0063712e4781 --- /dev/null +++ b/docker/Dockerfile.nightly_torch @@ -0,0 +1,307 @@ +# The vLLM Dockerfile is used to construct vLLM image against torch nightly that can be directly used for testing + +# for torch nightly, cuda >=12.6 is required, +# use 12.8 due to FlashAttention issue with cuda 12.6 (https://github.com/vllm-project/vllm/issues/15435#issuecomment-2775924628) +ARG CUDA_VERSION=12.8.0 +# +#################### BASE BUILD IMAGE #################### +# prepare basic build environment +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base +ARG CUDA_VERSION=12.8.0 +ARG PYTHON_VERSION=3.12 +ARG TARGETPLATFORM +ENV DEBIAN_FRONTEND=noninteractive +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl sudo \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version \ + && python3 -m pip --version +# Install uv for faster pip installs +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 +# as it was causing spam when compiling the CUTLASS kernels +RUN apt-get install -y gcc-10 g++-10 +RUN update-alternatives --install 
/usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 +RUN < torch_build_versions.txt +RUN cat torch_build_versions.txt + +# cuda arch list used by torch +# can be useful for `test` +# explicitly set the list to avoid issues with torch 2.2 +# see https://github.com/pytorch/pytorch/pull/123243 + +# Override the arch list for flash-attn to reduce the binary size +ARG vllm_fa_cmake_gpu_arches='80-real;90-real' +ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches} +#################### BASE BUILD IMAGE #################### + +#################### WHEEL BUILD IMAGE #################### +FROM base AS build +ARG TARGETPLATFORM + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +COPY . . + +RUN python3 use_existing_torch.py + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt + +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi + +# Max jobs used by Ninja to build extensions +ARG max_jobs=16 +ENV MAX_JOBS=${max_jobs} +ARG nvcc_threads=2 +ENV NVCC_THREADS=$nvcc_threads + +ARG USE_SCCACHE +ARG SCCACHE_BUCKET_NAME=vllm-build-sccache +ARG SCCACHE_REGION_NAME=us-west-2 +ARG SCCACHE_S3_NO_CREDENTIALS=0 + +# if USE_SCCACHE is set, use sccache to speed up compilation +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" = "1" ]; then \ + echo "Installing sccache..." \ + && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \ + && tar -xzf sccache.tar.gz \ + && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \ + && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \ + && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \ + && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \ + && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ + && export SCCACHE_IDLE_TIMEOUT=0 \ + && export CMAKE_BUILD_TYPE=Release \ + && sccache --show-stats \ + && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ + && sccache --show-stats; \ + fi + +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + --mount=type=bind,source=.git,target=.git \ + if [ "$USE_SCCACHE" != "1" ]; then \ + # Clean any existing CMake artifacts + rm -rf .deps && \ + mkdir -p .deps && \ + python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ + fi + +#################### WHEEL BUILD IMAGE #################### + +################### VLLM INSTALLED IMAGE #################### +# Setup clean environment for vLLM and its dependencies for test and api server using ubuntu22.04 with AOT flashinfer +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base +# prepare for environment starts +ARG CUDA_VERSION=12.8.0 +ARG PYTHON_VERSION=3.12 +WORKDIR /vllm-workspace +ENV DEBIAN_FRONTEND=noninteractive +ARG TARGETPLATFORM + +RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ + echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment + +# Install Python and other dependencies +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America 
select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +RUN --mount=type=cache,target=/root/.cache/uv \ + python3 -m pip install uv + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ + +# get the nightly torch version used in the build to make sure the version is the same +COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128 + +# install the vllm wheel +RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system vllm-dist/*.whl --verbose + +# install xformers again for the new environment +RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose + +ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' + +# install package for build flashinfer +# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738 +RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1 + + +# build flashinfer for torch nightly from source around 10 mins +# release version: v0.2.2.post1 +# todo(elainewy): cache flashinfer build result for faster build +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + echo "git clone flashinfer..." \ + && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \ + && cd flashinfer \ + && git checkout v0.2.2.post1 \ + && git submodule update --init --recursive \ + && echo "finish git clone flashinfer..." \ + && rm -rf build \ + && export TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} \ + && FLASHINFER_ENABLE_AOT=1 python3 setup.py bdist_wheel --dist-dir=../flashinfer-dist --verbose \ + && cd .. 
\ + && rm -rf flashinfer + +# install flashinfer +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system flashinfer-dist/*.whl --verbose + +# install common packages +COPY requirements/common.txt requirements/common.txt +COPY use_existing_torch.py use_existing_torch.py +COPY pyproject.toml pyproject.toml + +COPY examples examples +COPY benchmarks benchmarks +COPY ./vllm/collect_env.py . + +RUN python3 use_existing_torch.py +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/common.txt + +################### VLLM INSTALLED IMAGE #################### + + +#################### UNITTEST IMAGE ############################# +FROM vllm-base as test +COPY tests/ tests/ + +# install build and runtime dependencies without stable torch version +COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt + +# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out +# Reference: https://github.com/astral-sh/uv/pull/1694 +ENV UV_HTTP_TIMEOUT=500 + +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -e tests/vllm_test_utils + +# enable fast downloads from hf (for testing) +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system hf_transfer +ENV HF_HUB_ENABLE_HF_TRANSFER 1 + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/nightly_torch_test.txt + +#################### UNITTEST IMAGE ############################# + diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt new file mode 100644 index 000000000000..20372a9b2ef1 --- /dev/null +++ b/requirements/nightly_torch_test.txt @@ -0,0 +1,28 @@ +# Dependency that able to run entrypoints test +# pytest and its extensions +pytest +pytest-asyncio +pytest-forked +pytest-mock +pytest-rerunfailures +pytest-shard +pytest-timeout + + +librosa # required by audio tests in entrypoints/openai +sentence-transformers +numba == 0.61.2; python_version > '3.9' +# testing utils +awscli +boto3 +botocore +datasets +ray >= 2.10.0 +peft +runai-model-streamer==0.11.0 +runai-model-streamer-s3==0.11.0 +tensorizer>=2.9.0 +lm-eval==0.4.8 +buildkite-test-collector==0.1.9 + +lm-eval[api]==0.4.8 # required for model evaluation test From bc7c4d206bbfb56b06d218b6c2971e8ca191db36 Mon Sep 17 00:00:00 2001 From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Date: Tue, 22 Apr 2025 19:11:56 -0700 Subject: [PATCH 05/81] [Kernel][ROCM] Upstream prefix prefill speed up for vLLM V1 (#13305) Signed-off-by: Sage Moore Signed-off-by: root Signed-off-by: Aleksandr Malyshev Signed-off-by: root Signed-off-by: maleksan85 Signed-off-by: <> Co-authored-by: Sage Moore Co-authored-by: root Co-authored-by: Aleksandr Malyshev Co-authored-by: qli88 Co-authored-by: root --- tests/core/block/e2e/test_correctness.py | 6 +- vllm/attention/ops/prefix_prefill.py | 1634 +++++++++++----------- 2 files changed, 824 insertions(+), 816 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index e9b537ed5150..9e8e315d87b1 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -195,15 +195,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, ]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{ - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 2, 
"max_num_seqs": 2, }, { - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 3, "max_num_seqs": 2, }, { - "block_size": 8, + "block_size": 16, "max_num_batched_tokens": 256, "max_num_seqs": 10, }]) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index e0478c2aebda..a8c8d8409620 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -16,831 +16,778 @@ # To check compatibility IS_TURING = current_platform.get_device_capability() == (7, 5) -if triton.__version__ >= "2.1.0": - - @triton.jit - def _fwd_kernel( - Q, - K, - V, - K_cache, - V_cache, - B_Loc, - sm_scale, - k_scale, - v_scale, - B_Start_Loc, - B_Seqlen, - block_size, - x, - Out, - stride_b_loc_b, - stride_b_loc_s, - stride_qbs, - stride_qh, - stride_qd, - stride_kbs, - stride_kh, - stride_kd, - stride_vbs, - stride_vh, - stride_vd, - stride_obs, - stride_oh, - stride_od, - stride_k_cache_bs, - stride_k_cache_h, - stride_k_cache_d, - stride_k_cache_bl, - stride_k_cache_x, - stride_v_cache_bs, - stride_v_cache_h, - stride_v_cache_d, - stride_v_cache_bl, - num_queries_per_kv: int, - IN_PRECISION: tl.constexpr, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, # head size - BLOCK_DMODEL_PADDED: tl.constexpr, # head size padded to a power of 2 - BLOCK_N: tl.constexpr, - SLIDING_WINDOW: tl.constexpr, - SKIP_DECODE: tl.constexpr, - ): - - cur_batch = tl.program_id(0) - cur_head = tl.program_id(1) - start_m = tl.program_id(2) - - cur_kv_head = cur_head // num_queries_per_kv - - cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) - cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) - cur_batch_in_all_stop_index = tl.load(B_Start_Loc + cur_batch + 1) - cur_batch_query_len = (cur_batch_in_all_stop_index - - cur_batch_in_all_start_index) - cur_batch_ctx_len = cur_batch_seq_len - cur_batch_query_len - - if SKIP_DECODE and cur_batch_query_len == 1: - return - - # start position inside of the query - # generally, N goes over kv, while M goes over query_len - block_start_loc = BLOCK_M * start_m - - # initialize offsets - # [N]; starts at 0 - offs_n = tl.arange(0, BLOCK_N) - # [D]; starts at 0 - offs_d = tl.arange(0, BLOCK_DMODEL_PADDED) - # [M]; starts at current position in query - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - # [M,D] - off_q = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d[None, :] * stride_qd) - - dim_mask = tl.where( - tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, - 0).to(tl.int1) # [D] - - q = tl.load(Q + off_q, - mask=dim_mask[None, :] & - (offs_m[:, None] < cur_batch_query_len), - other=0.0) # [M,D] - - # initialize pointer to m and l - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") # [M] - l_i = tl.zeros([BLOCK_M], dtype=tl.float32) # [M] - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], - dtype=tl.float32) # [M,D] - - # compute query against context (no causal mask here) - for start_n in range(0, cur_batch_ctx_len, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + - ((start_n + offs_n) // block_size) * stride_b_loc_s, - mask=(start_n + offs_n) < cur_batch_ctx_len, - other=0) # [N] - # [D,N] - off_k = (bn[None, :] * stride_k_cache_bs + - cur_kv_head * stride_k_cache_h + - (offs_d[:, None] // x) * stride_k_cache_d + - ((start_n + offs_n[None, :]) % block_size) * - stride_k_cache_bl + - (offs_d[:, None] % x) * stride_k_cache_x) - # [N,D] - off_v = ( - bn[:, None] 
* stride_v_cache_bs + - cur_kv_head * stride_v_cache_h + - offs_d[None, :] * stride_v_cache_d + - (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) - k_load = tl.load(K_cache + off_k, - mask=dim_mask[:, None] & - ((start_n + offs_n[None, :]) < cur_batch_ctx_len), - other=0.0) # [D,N] - - if k_load.dtype.is_fp8(): - k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype) - else: - k = k_load - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) # [M,N] - qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) - qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, - float("-inf")) - qk *= sm_scale - if SLIDING_WINDOW > 0: - # (cur_batch_ctx_len + offs_m[:, None]) are the positions of - # Q entries in sequence - # (start_n + offs_n[None, :]) are the positions of - # KV entries in sequence - # So the condition makes sure each entry in Q only attends - # to KV entries not more than SLIDING_WINDOW away. - # - # We can't use -inf here, because the - # sliding window may lead to the entire row being masked. - # This then makes m_ij contain -inf, which causes NaNs in - # exp(). - qk = tl.where((cur_batch_ctx_len + offs_m[:, None]) - - (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, - -10000) - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) # [M] - p = tl.exp(qk - m_ij[:, None]) # [M,N] - l_ij = tl.sum(p, 1) # [M] - # -- update m_i and l_i - m_i_new = tl.maximum(m_i, m_ij) # [M] - alpha = tl.exp(m_i - m_i_new) # [M] - beta = tl.exp(m_ij - m_i_new) # [M] - l_i_new = alpha * l_i + beta * l_ij # [M] - - # -- update output accumulator -- - # scale p - p_scale = beta / l_i_new - p = p * p_scale[:, None] - # scale acc - acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v_load = tl.load(V_cache + off_v, - mask=dim_mask[None, :] & - ((start_n + offs_n[:, None]) < cur_batch_ctx_len), - other=0.0) # [N,D] - if v_load.dtype.is_fp8(): - v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype) - else: - v = v_load - p = p.to(v.dtype) - - acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION) - # # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + - offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + - offs_d[None, :] * stride_vd) - k_ptrs = K + off_k - v_ptrs = V + off_v - - # block_mask is 0 when we're already past the current query length - block_mask = tl.where(block_start_loc < cur_batch_query_len, 1, 0) - - # compute query against itself (with causal mask) - for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - k = tl.load(k_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=dim_mask[:, None] & - ((start_n + offs_n[None, :]) < cur_batch_query_len), - other=0.0) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) - qk *= sm_scale - # apply causal mask - qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, - float("-inf")) - if SLIDING_WINDOW > 0: - qk = tl.where( - offs_m[:, None] - (start_n + offs_n[None, :]) - < SLIDING_WINDOW, qk, -10000) - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - p = tl.exp(qk - m_ij[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - m_i_new = tl.maximum(m_i, m_ij) - alpha = tl.exp(m_i - m_i_new) - beta = tl.exp(m_ij - m_i_new) - l_i_new = alpha * l_i + beta * l_ij - # -- update output accumulator 
-- - # scale p - p_scale = beta / l_i_new - p = p * p_scale[:, None] - # scale acc - acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load(v_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=dim_mask[None, :] & - ((start_n + offs_n[:, None]) < cur_batch_query_len), - other=0.0) - p = p.to(v.dtype) - - acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - # initialize pointers to output - off_o = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + - cur_head * stride_oh + offs_d[None, :] * stride_od) - out_ptrs = Out + off_o - tl.store(out_ptrs, - acc, - mask=dim_mask[None, :] & - (offs_m[:, None] < cur_batch_query_len)) + +# Here's an example autotuner config for this kernel. This config does provide +# a performance improvement, but dramatically increases first call latency in +# triton 3.2. Because of this tradeoff, it's currently commented out. +# @triton.autotune( +# configs=[ +# triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, \ +# "num_unroll_cache": 4, \ +# "num_unroll_request": 1 } | \ +# ({"kpack": 2, "waves_per_eu": 2} \ +# if current_platform.is_rocm() else {}), \ +# num_warps=4, \ +# num_stages=1) +# ], +# key=["BLOCK_SIZE", "MAX_Q_LEN", "MAX_CTX_LEN"] +# ) +@triton.jit +def _fwd_kernel(Q, + K, + V, + K_cache, + V_cache, + B_Loc, + sm_scale, + k_scale, + v_scale, + B_Start_Loc, + B_Seqlen, + x: tl.constexpr, + Out, + stride_b_loc_b, + stride_b_loc_s, + stride_qbs, + stride_qh, + stride_qd, + stride_kbs, + stride_kh, + stride_kd, + stride_vbs, + stride_vh, + stride_vd, + stride_obs, + stride_oh, + stride_od, + stride_k_cache_bs, + stride_k_cache_h, + stride_k_cache_d, + stride_k_cache_bl: tl.constexpr, + stride_k_cache_x, + stride_v_cache_bs, + stride_v_cache_h, + stride_v_cache_d, + stride_v_cache_bl, + num_queries_per_kv: tl.constexpr, + IN_PRECISION: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_DMODEL_PADDED: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + BLOCK_N: tl.constexpr, + SLIDING_WINDOW: tl.constexpr, + num_unroll_cache: tl.constexpr, + num_unroll_request: tl.constexpr, + SKIP_DECODE: tl.constexpr, + MAX_Q_LEN: tl.constexpr = 0, + MAX_CTX_LEN: tl.constexpr = 0): + + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_kv_head = cur_head // num_queries_per_kv + + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + cur_batch_in_all_stop_index = tl.load(B_Start_Loc + cur_batch + 1) + cur_batch_query_len = (cur_batch_in_all_stop_index - + cur_batch_in_all_start_index) + cur_batch_ctx_len = cur_batch_seq_len - cur_batch_query_len + + if SKIP_DECODE and cur_batch_query_len == 1: return - @triton.jit - def _fwd_kernel_flash_attn_v2( - Q, - K, - V, - K_cache, - V_cache, - B_Loc, - sm_scale, - B_Start_Loc, - B_Seqlen, - B_Ctxlen, - block_size, - x, - Out, - stride_b_loc_b, - stride_b_loc_s, - stride_qbs, - stride_qh, - stride_qd, - stride_kbs, - stride_kh, - stride_kd, - stride_vbs, - stride_vh, - stride_vd, - stride_obs, - stride_oh, - stride_od, - stride_k_cache_bs, - stride_k_cache_h, - stride_k_cache_d, - stride_k_cache_bl, - stride_k_cache_x, - stride_v_cache_bs, - stride_v_cache_h, - stride_v_cache_d, - stride_v_cache_bl, - num_queries_per_kv: int, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, - ): - cur_batch = tl.program_id(0) - cur_head = tl.program_id(1) - start_m 
= tl.program_id(2) - - cur_kv_head = cur_head // num_queries_per_kv - - cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) - cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) - cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) - - block_start_loc = BLOCK_M * start_m - - # initialize offsets - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - off_q = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d[None, :] * stride_qd) - - q = tl.load(Q + off_q, - mask=offs_m[:, None] - < cur_batch_seq_len - cur_batch_ctx_len, + # start position inside of the query + # generally, N goes over kv, while M goes over query_len + block_start_loc = BLOCK_M * start_m + + # initialize offsets + # [BLOCK_SIZE]; starts at 0 + offs_bs_n = tl.arange(0, BLOCK_SIZE) + # [N]; starts at 0 + offs_n = tl.arange(0, BLOCK_N) + # [D]; starts at 0 + offs_d = tl.arange(0, BLOCK_DMODEL_PADDED) + # [M]; starts at current position in query + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + # [M,D] + off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + offs_d[None, :] * stride_qd) + + dim_mask = tl.where( + tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, + 0).to(tl.int1) # [D] + + q = tl.load(Q + off_q, + mask=dim_mask[None, :] & + (offs_m[:, None] < cur_batch_query_len), + other=0.0) # [M,D] + + # initialize pointer to m and l + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32) # [M,D] + + # compute query against context (no causal mask here) + for start_n in tl.range(0, cur_batch_ctx_len, BLOCK_SIZE, \ + loop_unroll_factor=num_unroll_cache): + start_n = tl.multiple_of(start_n, BLOCK_SIZE) + # -- compute qk ---- + bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + + (start_n // BLOCK_SIZE) * stride_b_loc_s) + # [D,BLOCK_SIZE] + off_k = ( + bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h + + (offs_d[:, None] // x) * stride_k_cache_d + + ((start_n + offs_bs_n[None, :]) % BLOCK_SIZE) * stride_k_cache_bl + + (offs_d[:, None] % x) * stride_k_cache_x) + + # [BLOCK_SIZE,D] + off_v = (bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + + offs_d[None, :] * stride_v_cache_d + + offs_bs_n[:, None] * stride_v_cache_bl) + + if start_n + BLOCK_SIZE > cur_batch_ctx_len or \ + BLOCK_DMODEL != BLOCK_DMODEL_PADDED: + k_load = tl.load( + K_cache + off_k, + mask=dim_mask[:, None] & + ((start_n + offs_bs_n[None, :]) < cur_batch_ctx_len), + other=0.0) # [D,N] + else: + k_load = tl.load(K_cache + off_k) + + if k_load.dtype.is_fp8(): + k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype) + else: + k = k_load + + qk = tl.zeros([BLOCK_M, BLOCK_SIZE], dtype=tl.float32) # [M,N] + qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) + qk = tl.where((start_n + offs_bs_n[None, :]) < cur_batch_ctx_len, qk, + float("-inf")) + qk *= sm_scale + if SLIDING_WINDOW > 0: + # (cur_batch_ctx_len + offs_m[:, None]) are the positions of + # Q entries in sequence + # (start_n + offs_bs_n[None, :]) are the positions of + # KV entries in sequence + # So the condition makes sure each entry in Q only attends + # to KV entries not more than SLIDING_WINDOW away. + # + # We can't use -inf here, because the + # sliding window may lead to the entire row being masked. 
+ # This then makes m_ij contain -inf, which causes NaNs in + # exp(). + qk = tl.where((cur_batch_ctx_len + offs_m[:, None]) - + (start_n + offs_bs_n[None, :]) < SLIDING_WINDOW, qk, + -10000) + + # compute running maximum + m_ij = tl.maximum(m_i, tl.max(qk, axis=1)) + p = tl.exp(qk - m_ij[:, None]) + l_ij = tl.sum(p, axis=1) + alpha = tl.exp(m_i - m_ij) + acc = acc * alpha[:, None] + + # update acc + if start_n + BLOCK_SIZE > cur_batch_ctx_len or \ + BLOCK_DMODEL != BLOCK_DMODEL_PADDED: + v_load = tl.load( + V_cache + off_v, + mask=dim_mask[None, :] & + ((start_n + offs_bs_n[:, None]) < cur_batch_ctx_len), + other=0.0) # [N,D] + else: + v_load = tl.load(V_cache + off_v) + + if v_load.dtype.is_fp8(): + v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype) + else: + v = v_load + p = p.to(v.dtype) + + acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION) + # # update m_i and l_i + l_i = l_i * alpha + l_ij + m_i = m_ij + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + + offs_d[:, None] * stride_kd) + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + + offs_d[None, :] * stride_vd) + k_ptrs = K + off_k + v_ptrs = V + off_v + + # block_mask is 0 when we're already past the current query length + block_mask = tl.where(block_start_loc < cur_batch_query_len, 1, 0) + + # compute query against itself (with causal mask) + for start_n in tl.range(0, \ + block_mask * (start_m + 1) * BLOCK_M, BLOCK_N, \ + loop_unroll_factor=num_unroll_request): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load(k_ptrs + + (cur_batch_in_all_start_index + start_n) * stride_kbs, + mask=dim_mask[:, None] & + ((start_n + offs_n[None, :]) < cur_batch_query_len), other=0.0) - # # initialize pointer to m and l - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - - for start_n in range(0, cur_batch_ctx_len, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + - ((start_n + offs_n) // block_size) * stride_b_loc_s, - mask=(start_n + offs_n) < cur_batch_ctx_len, - other=0) - off_k = (bn[None, :] * stride_k_cache_bs + - cur_kv_head * stride_k_cache_h + - (offs_d[:, None] // x) * stride_k_cache_d + - ((start_n + offs_n[None, :]) % block_size) * - stride_k_cache_bl + - (offs_d[:, None] % x) * stride_k_cache_x) - off_v = ( - bn[:, None] * stride_v_cache_bs + - cur_kv_head * stride_v_cache_h + - offs_d[None, :] * stride_v_cache_d + - (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) - k = tl.load(K_cache + off_k, - mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len, - other=0.0) - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, - float("-inf")) - qk *= sm_scale - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - m_i_new = tl.maximum(m_i, m_ij) - p = tl.math.exp(qk - m_i_new[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - - alpha = tl.math.exp(m_i - m_i_new) - l_i_new = alpha * l_i + l_ij - # -- update output accumulator -- - # scale p - # scale acc - acc_scale = alpha - # acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load(V_cache + off_v, - mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len, - other=0.0) - - p = p.to(v.dtype) - acc += tl.dot(p, v) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - 
off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + - offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + - offs_d[None, :] * stride_vd) - k_ptrs = K + off_k - v_ptrs = V + off_v - - block_mask = tl.where( - block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) - - for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - k = tl.load(k_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) - < cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk *= sm_scale - qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, - float("-inf")) - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - m_i_new = tl.maximum(m_i, m_ij) - p = tl.math.exp(qk - m_i_new[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - - alpha = tl.math.exp(m_i - m_i_new) - l_i_new = alpha * l_i + l_ij - # -- update output accumulator -- - # scale p - # scale acc - acc_scale = alpha - # acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load(v_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) - < cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) - - p = p.to(v.dtype) - acc += tl.dot(p, v) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - # acc /= l_i[:, None] - # initialize pointers to output - off_o = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + - cur_head * stride_oh + offs_d[None, :] * stride_od) - out_ptrs = Out + off_o - tl.store(out_ptrs, - acc, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len) - return - - @triton.jit - def _fwd_kernel_alibi( - Q, - K, - V, - K_cache, - V_cache, - B_Loc, - sm_scale, - k_scale, - v_scale, - B_Start_Loc, - B_Seqlen, - Alibi_slopes, - block_size, - x, - Out, - stride_b_loc_b, - stride_b_loc_s, - stride_qbs, - stride_qh, - stride_qd, - stride_kbs, - stride_kh, - stride_kd, - stride_vbs, - stride_vh, - stride_vd, - stride_obs, - stride_oh, - stride_od, - stride_k_cache_bs, - stride_k_cache_h, - stride_k_cache_d, - stride_k_cache_bl, - stride_k_cache_x, - stride_v_cache_bs, - stride_v_cache_h, - stride_v_cache_d, - stride_v_cache_bl, - num_queries_per_kv: int, - IN_PRECISION: tl.constexpr, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, # head size - BLOCK_DMODEL_PADDED: tl.constexpr, # head size padded to a power of 2 - BLOCK_N: tl.constexpr, - SKIP_DECODE: tl.constexpr, - ): - # attn_bias[] - cur_batch = tl.program_id(0) - cur_head = tl.program_id(1) - start_m = tl.program_id(2) - - cur_kv_head = cur_head // num_queries_per_kv - - # cur_batch_seq_len: the length of prompts - # cur_batch_ctx_len: the length of prefix - # cur_batch_in_all_start_index: the start id of the dim=0 - cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) - cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) - cur_batch_in_all_stop_index = tl.load(B_Start_Loc + cur_batch + 1) - cur_batch_query_len = (cur_batch_in_all_stop_index - - cur_batch_in_all_start_index) - cur_batch_ctx_len = cur_batch_seq_len - cur_batch_query_len - - if SKIP_DECODE and cur_batch_query_len == 1: - return - - block_start_loc = BLOCK_M * start_m - - # initialize offsets - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL_PADDED) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - 
off_q = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + - cur_head * stride_qh + offs_d[None, :] * stride_qd) - - dim_mask = tl.where( - tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, 0).to(tl.int1) - - q = tl.load(Q + off_q, + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) + qk *= sm_scale + # apply causal mask + qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, + float("-inf")) + if SLIDING_WINDOW > 0: + qk = tl.where( + offs_m[:, None] - (start_n + offs_n[None, :]) < SLIDING_WINDOW, + qk, -10000) + + # compute running maximum + m_ij = tl.maximum(m_i, tl.max(qk, axis=1)) + p = tl.exp(qk - m_ij[:, None]) + l_ij = tl.sum(p, axis=1) + alpha = tl.exp(m_i - m_ij) + acc = acc * alpha[:, None] + + # update acc + v = tl.load(v_ptrs + + (cur_batch_in_all_start_index + start_n) * stride_vbs, mask=dim_mask[None, :] & - (offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[:, None]) < cur_batch_query_len), + other=0.0) + p = p.to(v.dtype) + + acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION) + # update m_i and l_i + l_i = l_i * alpha + l_ij + m_i = m_ij + + acc = acc / l_i[:, None] + + # initialize pointers to output + off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + offs_d[None, :] * stride_od) + out_ptrs = Out + off_o + tl.store(out_ptrs, + acc, + mask=dim_mask[None, :] & (offs_m[:, None] < cur_batch_query_len)) + return + + +@triton.jit +def _fwd_kernel_flash_attn_v2( + Q, + K, + V, + K_cache, + V_cache, + B_Loc, + sm_scale, + B_Start_Loc, + B_Seqlen, + B_Ctxlen, + block_size, + x, + Out, + stride_b_loc_b, + stride_b_loc_s, + stride_qbs, + stride_qh, + stride_qd, + stride_kbs, + stride_kh, + stride_kd, + stride_vbs, + stride_vh, + stride_vd, + stride_obs, + stride_oh, + stride_od, + stride_k_cache_bs, + stride_k_cache_h, + stride_k_cache_d, + stride_k_cache_bl, + stride_k_cache_x, + stride_v_cache_bs, + stride_v_cache_h, + stride_v_cache_d, + stride_v_cache_bl, + num_queries_per_kv: int, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_kv_head = cur_head // num_queries_per_kv + + cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + + block_start_loc = BLOCK_M * start_m + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + offs_d[None, :] * stride_qd) + + q = tl.load(Q + off_q, + mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) + + # # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + + for start_n in range(0, cur_batch_ctx_len, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + + ((start_n + offs_n) // block_size) * stride_b_loc_s, + mask=(start_n + offs_n) < cur_batch_ctx_len, + other=0) + off_k = ( + bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h + + (offs_d[:, None] // x) * stride_k_cache_d + + ((start_n + 
offs_n[None, :]) % block_size) * stride_k_cache_bl + + (offs_d[:, None] % x) * stride_k_cache_x) + off_v = (bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + + offs_d[None, :] * stride_v_cache_d + + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) + k = tl.load(K_cache + off_k, + mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len, + other=0.0) + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, + float("-inf")) + qk *= sm_scale + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + m_i_new = tl.maximum(m_i, m_ij) + p = tl.math.exp(qk - m_i_new[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + + alpha = tl.math.exp(m_i - m_i_new) + l_i_new = alpha * l_i + l_ij + # -- update output accumulator -- + # scale p + # scale acc + acc_scale = alpha + # acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load(V_cache + off_v, + mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len, + other=0.0) + + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + + offs_d[:, None] * stride_kd) + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + + offs_d[None, :] * stride_vd) + k_ptrs = K + off_k + v_ptrs = V + off_v + + block_mask = tl.where( + block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) + + for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load(k_ptrs + + (cur_batch_in_all_start_index + start_n) * stride_kbs, + mask=(start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) - # # initialize pointer to m and l - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32) - - alibi_slope = tl.load(Alibi_slopes + cur_head) - alibi_start_q = tl.arange( - 0, BLOCK_M) + block_start_loc + cur_batch_ctx_len - alibi_start_k = 0 - for start_n in range(0, cur_batch_ctx_len, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + - ((start_n + offs_n) // block_size) * stride_b_loc_s, - mask=(start_n + offs_n) < cur_batch_ctx_len, - other=0) - off_k = (bn[None, :] * stride_k_cache_bs + - cur_kv_head * stride_k_cache_h + - (offs_d[:, None] // x) * stride_k_cache_d + - ((start_n + offs_n[None, :]) % block_size) * - stride_k_cache_bl + - (offs_d[:, None] % x) * stride_k_cache_x) - off_v = ( - bn[:, None] * stride_v_cache_bs + - cur_kv_head * stride_v_cache_h + - offs_d[None, :] * stride_v_cache_d + - (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) - k_load = tl.load(K_cache + off_k, - mask=dim_mask[:, None] & - ((start_n + offs_n[None, :]) < cur_batch_ctx_len), - other=0.0) # [D,N] - - if k_load.dtype.is_fp8(): - k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype) - else: - k = k_load - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) - qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, - float("-inf")) - qk *= sm_scale - - # load alibi - alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - - alibi_start_q[:, None]) * alibi_slope - alibi = tl.where( - (alibi <= 0) & (alibi_start_q[:, None] < 
cur_batch_seq_len), - alibi, float("-inf")) - qk += alibi - alibi_start_k += BLOCK_N - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - m_i_new = tl.maximum(m_i, m_ij) - p = tl.math.exp(qk - m_i_new[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - - alpha = tl.math.exp(m_i - m_i_new) - l_i_new = alpha * l_i + l_ij - # -- update output accumulator -- - # scale p - # scale acc - acc_scale = alpha - # acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v_load = tl.load(V_cache + off_v, - mask=dim_mask[None, :] & - ((start_n + offs_n[:, None]) < cur_batch_ctx_len), - other=0.0) - if v_load.dtype.is_fp8(): - v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype) - else: - v = v_load - p = p.to(v.dtype) - - acc = tl.dot(p, v, acc=acc, input_precision='ieee') - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + - offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + - offs_d[None, :] * stride_vd) - k_ptrs = K + off_k - v_ptrs = V + off_v - - block_mask = tl.where( - block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) - - # init alibi - alibi_slope = tl.load(Alibi_slopes + cur_head) - alibi_start_q = tl.arange( - 0, BLOCK_M) + block_start_loc + cur_batch_ctx_len - alibi_start_k = cur_batch_ctx_len - # # init debugger - # offset_db_q = tl.arange(0, BLOCK_M) + block_start_loc - # offset_db_k = tl.arange(0, BLOCK_N) - # calc q[BLOCK_M, BLOCK_MODEL] mul k[prefix_len: , BLOCK_DMODEL] - for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - k = tl.load(k_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=dim_mask[:, None] & - ((start_n + offs_n[None, :]) - < cur_batch_seq_len - cur_batch_ctx_len), - other=0.0) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk = tl.dot(q, k, acc=qk, input_precision='ieee') - qk *= sm_scale - qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, - float("-inf")) - - # load alibi - alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - - alibi_start_q[:, None]) * alibi_slope - alibi = tl.where( - (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), - alibi, float("-inf")) - qk += alibi - alibi_start_k += BLOCK_N - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - m_i_new = tl.maximum(m_i, m_ij) - p = tl.math.exp(qk - m_i_new[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - - alpha = tl.math.exp(m_i - m_i_new) - l_i_new = alpha * l_i + l_ij - # -- update output accumulator -- - # scale p - # scale acc - acc_scale = alpha - # acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load(v_ptrs + - (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=dim_mask[None, :] & - ((start_n + offs_n[:, None]) - < cur_batch_seq_len - cur_batch_ctx_len), - other=0.0) - p = p.to(v.dtype) - - acc = tl.dot(p, v, acc=acc, input_precision='ieee') - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - - acc = acc / l_i[:, None] - - # initialize pointers to output - off_o = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + - cur_head * stride_oh + offs_d[None, :] * stride_od) - out_ptrs = Out + off_o - tl.store(out_ptrs, - acc, - mask=dim_mask[None, :] & - (offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)) + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk *= sm_scale 
+ qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, + float("-inf")) + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + m_i_new = tl.maximum(m_i, m_ij) + p = tl.math.exp(qk - m_i_new[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + + alpha = tl.math.exp(m_i - m_i_new) + l_i_new = alpha * l_i + l_ij + # -- update output accumulator -- + # scale p + # scale acc + acc_scale = alpha + # acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load(v_ptrs + + (cur_batch_in_all_start_index + start_n) * stride_vbs, + mask=(start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) + + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + # acc /= l_i[:, None] + # initialize pointers to output + off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + offs_d[None, :] * stride_od) + out_ptrs = Out + off_o + tl.store(out_ptrs, + acc, + mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len) + return + + +@triton.jit +def _fwd_kernel_alibi( + Q, + K, + V, + K_cache, + V_cache, + B_Loc, + sm_scale, + k_scale, + v_scale, + B_Start_Loc, + B_Seqlen, + Alibi_slopes, + block_size, + x, + Out, + stride_b_loc_b, + stride_b_loc_s, + stride_qbs, + stride_qh, + stride_qd, + stride_kbs, + stride_kh, + stride_kd, + stride_vbs, + stride_vh, + stride_vd, + stride_obs, + stride_oh, + stride_od, + stride_k_cache_bs, + stride_k_cache_h, + stride_k_cache_d, + stride_k_cache_bl, + stride_k_cache_x, + stride_v_cache_bs, + stride_v_cache_h, + stride_v_cache_d, + stride_v_cache_bl, + num_queries_per_kv: int, + IN_PRECISION: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, # head size + BLOCK_DMODEL_PADDED: tl.constexpr, # head size padded to a power of 2 + BLOCK_N: tl.constexpr, + SKIP_DECODE: tl.constexpr, +): + # attn_bias[] + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_kv_head = cur_head // num_queries_per_kv + + # cur_batch_seq_len: the length of prompts + # cur_batch_ctx_len: the length of prefix + # cur_batch_in_all_start_index: the start id of the dim=0 + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + cur_batch_in_all_stop_index = tl.load(B_Start_Loc + cur_batch + 1) + cur_batch_query_len = (cur_batch_in_all_stop_index - + cur_batch_in_all_start_index) + cur_batch_ctx_len = cur_batch_seq_len - cur_batch_query_len + + if SKIP_DECODE and cur_batch_query_len == 1: return - @torch.inference_mode() - def context_attention_fwd(q, - k, - v, - o, - kv_cache_dtype: str, - k_cache, - v_cache, - b_loc, - b_start_loc, - b_seq_len, - max_seq_len, - max_input_len, - k_scale: torch.Tensor, - v_scale: torch.Tensor, - alibi_slopes=None, - sliding_window=None, - sm_scale=None, - skip_decode=False): - - q_dtype_is_f32 = q.dtype is torch.float32 + block_start_loc = BLOCK_M * start_m + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL_PADDED) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + offs_d[None, :] * stride_qd) + + dim_mask = tl.where( + tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, 0).to(tl.int1) + + q = tl.load(Q + off_q, + mask=dim_mask[None, :] & + (offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len), + other=0.0) + + # # initialize pointer to m and 
l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32) + + alibi_slope = tl.load(Alibi_slopes + cur_head) + alibi_start_q = tl.arange(0, BLOCK_M) + block_start_loc + cur_batch_ctx_len + alibi_start_k = 0 + for start_n in range(0, cur_batch_ctx_len, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + bn = tl.load(B_Loc + cur_batch * stride_b_loc_b + + ((start_n + offs_n) // block_size) * stride_b_loc_s, + mask=(start_n + offs_n) < cur_batch_ctx_len, + other=0) + off_k = ( + bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h + + (offs_d[:, None] // x) * stride_k_cache_d + + ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl + + (offs_d[:, None] % x) * stride_k_cache_x) + off_v = (bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + + offs_d[None, :] * stride_v_cache_d + + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) + k_load = tl.load(K_cache + off_k, + mask=dim_mask[:, None] & + ((start_n + offs_n[None, :]) < cur_batch_ctx_len), + other=0.0) # [D,N] + + if k_load.dtype.is_fp8(): + k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype) + else: + k = k_load + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) + qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, + float("-inf")) + qk *= sm_scale + + # load alibi + alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - + alibi_start_q[:, None]) * alibi_slope + alibi = tl.where( + (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), alibi, + float("-inf")) + qk += alibi + alibi_start_k += BLOCK_N + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + m_i_new = tl.maximum(m_i, m_ij) + p = tl.math.exp(qk - m_i_new[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + + alpha = tl.math.exp(m_i - m_i_new) + l_i_new = alpha * l_i + l_ij + # -- update output accumulator -- + # scale p + # scale acc + acc_scale = alpha + # acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v_load = tl.load(V_cache + off_v, + mask=dim_mask[None, :] & + ((start_n + offs_n[:, None]) < cur_batch_ctx_len), + other=0.0) + if v_load.dtype.is_fp8(): + v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype) + else: + v = v_load + p = p.to(v.dtype) + + acc = tl.dot(p, v, acc=acc, input_precision='ieee') + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + + offs_d[:, None] * stride_kd) + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + + offs_d[None, :] * stride_vd) + k_ptrs = K + off_k + v_ptrs = V + off_v + + block_mask = tl.where( + block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0) + + # init alibi + alibi_slope = tl.load(Alibi_slopes + cur_head) + alibi_start_q = tl.arange(0, BLOCK_M) + block_start_loc + cur_batch_ctx_len + alibi_start_k = cur_batch_ctx_len + # # init debugger + # offset_db_q = tl.arange(0, BLOCK_M) + block_start_loc + # offset_db_k = tl.arange(0, BLOCK_N) + # calc q[BLOCK_M, BLOCK_MODEL] mul k[prefix_len: , BLOCK_DMODEL] + for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load( + k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, + mask=dim_mask[:, None] & ((start_n + offs_n[None, :]) + < cur_batch_seq_len - 
cur_batch_ctx_len), + other=0.0) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk = tl.dot(q, k, acc=qk, input_precision='ieee') + qk *= sm_scale + qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, + float("-inf")) + + # load alibi + alibi = (tl.arange(0, BLOCK_N)[None, :] + alibi_start_k - + alibi_start_q[:, None]) * alibi_slope + alibi = tl.where( + (alibi <= 0) & (alibi_start_q[:, None] < cur_batch_seq_len), alibi, + float("-inf")) + qk += alibi + alibi_start_k += BLOCK_N + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + m_i_new = tl.maximum(m_i, m_ij) + p = tl.math.exp(qk - m_i_new[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + + alpha = tl.math.exp(m_i - m_i_new) + l_i_new = alpha * l_i + l_ij + # -- update output accumulator -- + # scale p + # scale acc + acc_scale = alpha + # acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load( + v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, + mask=dim_mask[None, :] & ((start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len), + other=0.0) + p = p.to(v.dtype) + + acc = tl.dot(p, v, acc=acc, input_precision='ieee') + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + + acc = acc / l_i[:, None] + + # initialize pointers to output + off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + offs_d[None, :] * stride_od) + out_ptrs = Out + off_o + tl.store(out_ptrs, + acc, + mask=dim_mask[None, :] & + (offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)) + return + + +@torch.inference_mode() +def context_attention_fwd(q, + k, + v, + o, + kv_cache_dtype: str, + k_cache, + v_cache, + b_loc, + b_start_loc, + b_seq_len, + max_seq_len, + max_input_len, + k_scale: torch.Tensor, + v_scale: torch.Tensor, + alibi_slopes=None, + sliding_window=None, + sm_scale=None, + skip_decode=False): + + q_dtype_is_f32 = q.dtype is torch.float32 + + # Turing does have tensor core for float32 multiplication + # use ieee as fallback for triton kernels work. There is also + # warning on vllm/config.py to inform users this fallback + # implementation + IN_PRECISION = 'ieee' if IS_TURING and q_dtype_is_f32 else None + + # Conversion of FP8 Tensor from uint8 storage to + # appropriate torch.dtype for interpretation by Triton + if "fp8" in kv_cache_dtype: + assert (k_cache.dtype == torch.uint8) + assert (v_cache.dtype == torch.uint8) + + if kv_cache_dtype in ("fp8", "fp8_e4m3"): + target_dtype = current_platform.fp8_dtype() + elif kv_cache_dtype == "fp8_e5m2": + target_dtype = torch.float8_e5m2 + else: + raise ValueError("Unsupported FP8 dtype:", kv_cache_dtype) + + k_cache = k_cache.view(target_dtype) + v_cache = v_cache.view(target_dtype) + + if (k_cache.dtype == torch.uint8 + or v_cache.dtype == torch.uint8 and kv_cache_dtype == "auto"): + raise ValueError("kv_cache_dtype='auto' unsupported for\ + FP8 KV Cache prefill kernel") + + # shape constraints + Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] + assert Lq == Lk and Lk == Lv + # round up Lk to a power of 2 - this is required for Triton block size + Lk_padded = triton.next_power_of_2(Lk) + + if sm_scale is None: + sm_scale = 1.0 / (Lq**0.5) + batch, head = b_seq_len.shape[0], q.shape[1] + num_queries_per_kv = q.shape[1] // k.shape[1] + + assert batch + 1 == len(b_start_loc) + + # 0 means "disable" + if sliding_window is None or sliding_window <= 0: + sliding_window = 0 + + if alibi_slopes is not None: # need to reduce num. 
blocks when using fp32 # due to increased use of GPU shared memory # if q.dtype is torch.float32: BLOCK = BASE_BLOCK // 2 if q_dtype_is_f32 else BASE_BLOCK - - # Turing does have tensor core for float32 multiplication - # use ieee as fallback for triton kernels work. There is also - # warning on vllm/config.py to inform users this fallback - # implementation - IN_PRECISION = 'ieee' if IS_TURING and q_dtype_is_f32 else None - - # Conversion of FP8 Tensor from uint8 storage to - # appropriate torch.dtype for interpretation by Triton - if "fp8" in kv_cache_dtype: - assert (k_cache.dtype == torch.uint8) - assert (v_cache.dtype == torch.uint8) - - if kv_cache_dtype in ("fp8", "fp8_e4m3"): - target_dtype = current_platform.fp8_dtype() - elif kv_cache_dtype == "fp8_e5m2": - target_dtype = torch.float8_e5m2 - else: - raise ValueError("Unsupported FP8 dtype:", kv_cache_dtype) - - k_cache = k_cache.view(target_dtype) - v_cache = v_cache.view(target_dtype) - - if (k_cache.dtype == torch.uint8 - or v_cache.dtype == torch.uint8 and kv_cache_dtype == "auto"): - raise ValueError("kv_cache_dtype='auto' unsupported for\ - FP8 KV Cache prefill kernel") - - # shape constraints - Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] - assert Lq == Lk and Lk == Lv - # round up Lk to a power of 2 - this is required for Triton block size - Lk_padded = triton.next_power_of_2(Lk) - - if sm_scale is None: - sm_scale = 1.0 / (Lq**0.5) - batch, head = b_seq_len.shape[0], q.shape[1] - num_queries_per_kv = q.shape[1] // k.shape[1] - - assert batch + 1 == len(b_start_loc) - grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, - - # 0 means "disable" - if sliding_window is None or sliding_window <= 0: - sliding_window = 0 - - if alibi_slopes is not None: - _fwd_kernel_alibi[grid]( - q, - k, - v, - k_cache, - v_cache, - b_loc, - sm_scale, - k_scale, - v_scale, - b_start_loc, - b_seq_len, - alibi_slopes, - v_cache.shape[3], - k_cache.shape[4], - o, - b_loc.stride(0), - b_loc.stride(1), - q.stride(0), - q.stride(1), - q.stride(2), - k.stride(0), - k.stride(1), - k.stride(2), - v.stride(0), - v.stride(1), - v.stride(2), - o.stride(0), - o.stride(1), - o.stride(2), - k_cache.stride(0), - k_cache.stride(1), - k_cache.stride(2), - k_cache.stride(3), - k_cache.stride( - 4 - ), #[num_blocks, num_kv_heads, head_size/x, block_size, x] - v_cache.stride(0), - v_cache.stride(1), - v_cache.stride(2), - v_cache.stride( - 3), #[num_blocks, num_kv_heads, head_size, block_size] - num_queries_per_kv=num_queries_per_kv, - IN_PRECISION=IN_PRECISION, - BLOCK_M=BLOCK, - BLOCK_DMODEL=Lk, - BLOCK_DMODEL_PADDED=Lk_padded, - BLOCK_N=BLOCK, - SKIP_DECODE=skip_decode, - num_warps=NUM_WARPS, - num_stages=1, - ) - return - - _fwd_kernel[grid]( + # batch, head, + grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) + _fwd_kernel_alibi[grid]( q, k, v, @@ -852,6 +799,7 @@ def context_attention_fwd(q, v_scale, b_start_loc, b_seq_len, + alibi_slopes, v_cache.shape[3], k_cache.shape[4], o, @@ -886,9 +834,69 @@ def context_attention_fwd(q, BLOCK_DMODEL=Lk, BLOCK_DMODEL_PADDED=Lk_padded, BLOCK_N=BLOCK, - SLIDING_WINDOW=sliding_window, SKIP_DECODE=skip_decode, num_warps=NUM_WARPS, num_stages=1, ) return + + max_seq_len = 0 if max_seq_len is None else max_seq_len + extra_kargs = {} + if current_platform.is_rocm(): + extra_kargs = {"kpack": 2, "waves_per_eu": 2} + + grid = lambda META: (batch, head, + triton.cdiv(max_input_len, META["BLOCK_M"])) + _fwd_kernel[grid]( + q, + k, + v, + k_cache, + v_cache, + b_loc, + sm_scale, + k_scale, + v_scale, 
+ b_start_loc, + b_seq_len, + k_cache.shape[4], + o, + b_loc.stride(0), + b_loc.stride(1), + q.stride(0), + q.stride(1), + q.stride(2), + k.stride(0), + k.stride(1), + k.stride(2), + v.stride(0), + v.stride(1), + v.stride(2), + o.stride(0), + o.stride(1), + o.stride(2), + k_cache.stride(0), + k_cache.stride(1), + k_cache.stride(2), + k_cache.stride(3), + k_cache.stride( + 4), #[num_blocks, num_kv_heads, head_size/x, block_size, x] + v_cache.stride(0), + v_cache.stride(1), + v_cache.stride(2), + v_cache.stride(3), #[num_blocks, num_kv_heads, head_size, block_size] + BLOCK_SIZE=v_cache.shape[3], + num_queries_per_kv=num_queries_per_kv, + IN_PRECISION=IN_PRECISION, + BLOCK_DMODEL=Lk, + BLOCK_DMODEL_PADDED=Lk_padded, + SLIDING_WINDOW=sliding_window, + SKIP_DECODE=skip_decode, + BLOCK_M=128, + BLOCK_N=64, + num_unroll_cache=4, + num_unroll_request=1, + num_warps=4, + num_stages=1, + **extra_kargs) + return From 1e013fa3887ccabcffea8cbe3714a8805aafaae4 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 22 Apr 2025 19:12:15 -0700 Subject: [PATCH 06/81] [V1][DP] More robust DP/EP dummy request coordination (#16277) Signed-off-by: Nick Hill --- tests/v1/test_async_llm_dp.py | 4 +- vllm/v1/engine/__init__.py | 15 ++++++-- vllm/v1/engine/core.py | 63 +++++++++++++++++++++----------- vllm/v1/engine/core_client.py | 69 +++++++++++++++++++---------------- 4 files changed, 94 insertions(+), 57 deletions(-) diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index f0e031969e73..ce4c4d198db5 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -101,9 +101,9 @@ async def test_load(output_kind: RequestOutputKind): # the engines only synchronize stopping every N steps so # allow a small amount of time here. for _ in range(10): - if core_client.num_engines_running == 0: + if not core_client.engines_running: break await asyncio.sleep(0.5) - assert core_client.num_engines_running == 0 + assert not core_client.engines_running assert not core_client.reqs_in_flight diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index af4122a51077..5f5675b955fa 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -61,6 +61,11 @@ class EngineCoreRequest( arrival_time: float lora_request: Optional[LoRARequest] + # Used in DP case to indicate which wave of requests this is expected to + # belong to, to cover a race condition where the request is sent before + # a wave finished notification is received. + current_wave: int = 0 + class EngineCoreEventType(enum.IntEnum): """The type of engine core request event.""" @@ -139,8 +144,12 @@ class EngineCoreOutputs( utility_output: Optional[UtilityOutput] = None finished_requests: Optional[set[str]] = None - # In DP case, used to signal that the engine is paused. - engine_paused: bool = False + # In DP case, used to signal that the current wave of requests + # has finished and the engines are paused. + wave_complete: Optional[int] = None + # In DP case, used to signal that a request was received for an + # "old" wave, so the next wave needs to be started in other engines. + start_wave: Optional[int] = None def __post_init__(self): if self.timestamp == 0.0: @@ -154,7 +163,7 @@ class EngineCoreRequestType(enum.Enum): """ ADD = b'\x00' ABORT = b'\x01' - START_DP = b'\x02' + START_DP_WAVE = b'\x02' UTILITY = b'\x03' # Sentinel used within EngineCoreProc. 
EXECUTOR_FAILED = b'\x04' diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 9c4036efd050..2211431fbceb 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -325,7 +325,7 @@ def __init__( self.step_fn = (self.step if self.batch_queue is None else self.step_with_batch_queue) - self.global_unfinished_reqs = False + self.engines_running = False # Background Threads and Queues for IO. These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, @@ -410,8 +410,7 @@ def _process_input_queue(self): """Exits when an engine step needs to be performed.""" waited = False - while not self.global_unfinished_reqs and not ( - self.scheduler.has_requests()): + while not self.engines_running and not (self.scheduler.has_requests()): if logger.isEnabledFor(DEBUG) and self.input_queue.empty(): logger.debug("EngineCore waiting for work.") waited = True @@ -419,10 +418,7 @@ def _process_input_queue(self): self._handle_client_request(*req) if waited: - logger.debug( - "EngineCore loop active - local unfinished: %s, finished: %s.", - self.scheduler.has_unfinished_requests(), - self.scheduler.has_finished_requests()) + logger.debug("EngineCore loop active.") # Handle any more client requests. while not self.input_queue.empty(): @@ -446,10 +442,6 @@ def _handle_client_request(self, request_type: EngineCoreRequestType, self.add_request(request) elif request_type == EngineCoreRequestType.ABORT: self.abort_requests(request) - elif request_type == EngineCoreRequestType.START_DP: - if not self.global_unfinished_reqs: - logger.debug("EngineCore starting idle loop.") - self.global_unfinished_reqs = True elif request_type == EngineCoreRequestType.UTILITY: call_id, method_name, args = request output = UtilityOutput(call_id) @@ -548,9 +540,6 @@ def process_output_socket(self, output_path: str, engine_index: int): socket.send_multipart(buffers, copy=False) -ENGINE_PAUSED_OUTPUTS = EngineCoreOutputs(engine_paused=True) - - class DPEngineCoreProc(EngineCoreProc): """ZMQ-wrapper for running EngineCore in background process in a data parallel context.""" @@ -587,7 +576,9 @@ def __init__( for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) * tp_size)) + self.local_dp_rank = local_dp_rank self.dp_group = vllm_config.parallel_config.stateless_init_dp_group() + self.current_wave = 0 # Initialize the engine after setting up environment. super().__init__(input_path, output_path, vllm_config, executor_class, @@ -602,6 +593,31 @@ def shutdown(self): if dp_group := getattr(self, "dp_group", None): stateless_destroy_torch_distributed_process_group(dp_group) + def add_request(self, request: EngineCoreRequest): + if request.current_wave != self.current_wave: + if request.current_wave > self.current_wave: + self.current_wave = request.current_wave + elif not self.engines_running: + # Request received for an already-completed wave, notify + # front-end that we need to start the next one. 
+ self.output_queue.put_nowait( + EngineCoreOutputs(start_wave=self.current_wave)) + + super().add_request(request) + + def _handle_client_request(self, request_type: EngineCoreRequestType, + request: Any) -> None: + if request_type == EngineCoreRequestType.START_DP_WAVE: + new_wave: int = request + if new_wave >= self.current_wave: + self.current_wave = new_wave + if not self.engines_running: + logger.debug("EngineCore starting idle loop for wave %d.", + new_wave) + self.engines_running = True + else: + super()._handle_client_request(request_type, request) + def run_busy_loop(self): """Core busy loop of the EngineCore for data parallel case.""" @@ -628,7 +644,7 @@ def run_busy_loop(self): # up-to-date state is returned in the engine outputs. self._process_engine_step() - if not self.global_unfinished_reqs: + if not self.engines_running: # All engines are idle. continue @@ -637,18 +653,23 @@ def run_busy_loop(self): self.execute_dummy_batch() # 3) All-reduce operation to determine global unfinished reqs. - self.global_unfinished_reqs = self._has_global_unfinished_reqs( + self.engines_running = self._has_global_unfinished_reqs( local_unfinished_reqs) - if not self.global_unfinished_reqs: - # Notify client that we are pausing the loop. - self.output_queue.put_nowait(ENGINE_PAUSED_OUTPUTS) + if not self.engines_running: + if self.local_dp_rank == 0: + # Notify client that we are pausing the loop. + logger.debug("Wave %d finished, pausing engine loop.", + self.current_wave) + self.output_queue.put_nowait( + EngineCoreOutputs(wave_complete=self.current_wave)) + self.current_wave += 1 def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool: - # Optimization - only perform finish-sync all-reduce every 16 steps. + # Optimization - only perform finish-sync all-reduce every 24 steps. self.counter += 1 - if self.counter != 16: + if self.counter != 24: return True self.counter = 0 diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index f54b3546f06d..0efb5dfb39b7 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -792,15 +792,12 @@ class DPAsyncMPClient(AsyncMPClient): def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool): - self.num_engines_running = 0 + self.current_wave = 0 + self.engines_running = False self.reqs_in_flight: dict[str, CoreEngine] = {} super().__init__(vllm_config, executor_class, log_stats) - # Control message used for triggering dp idle mode loop. - self.start_dp_msg = (EngineCoreRequestType.START_DP.value, - *self.encoder.encode(None)) - assert len(self.core_engines) > 1 def _init_core_engines( @@ -829,23 +826,23 @@ async def add_request_async(self, request: EngineCoreRequest) -> None: # NOTE: text prompt is not needed in the core engine as it has been # tokenized. request.prompt = None - - msg = (EngineCoreRequestType.ADD.value, *self.encoder.encode(request)) + request.current_wave = self.current_wave chosen_engine = self.get_core_engine_for_request() self.reqs_in_flight[request.request_id] = chosen_engine chosen_engine.num_reqs_in_flight += 1 - if self.num_engines_running >= len(self.core_engines): - await self._send_input_message(msg, chosen_engine) - else: + + to_await = self._send_input(EngineCoreRequestType.ADD, request, + chosen_engine) + if not self.engines_running: # Send request to chosen engine and dp start loop # control message to all other engines. 
- self.num_engines_running += len(self.core_engines) - await asyncio.gather(*[ - self._send_input_message( - msg if engine is chosen_engine else self.start_dp_msg, - engine) for engine in self.core_engines - ]) + self.engines_running = True + to_await = asyncio.gather( + to_await, # type: ignore[assignment] + *self._start_wave_coros(exclude_index=chosen_engine.index)) + + await to_await self._ensure_output_queue_task() @@ -860,21 +857,31 @@ async def process_engine_outputs(self: "DPAsyncMPClient", if engine := self.reqs_in_flight.pop(req_id, None): engine.num_reqs_in_flight -= 1 - if outputs.engine_paused: - assert self.num_engines_running >= 1 - self.num_engines_running -= 1 - if not self.num_engines_running and self.reqs_in_flight: - # If there are requests in flight here, they must have - # been sent after the engines paused. We must make - # sure to start the other engines: - self.num_engines_running = len(self.core_engines) - coros = [ - self._send_input_message(self.start_dp_msg, engine) - for engine in self.core_engines - if not engine.num_reqs_in_flight - ] - if coros: - await asyncio.gather(*coros) + if outputs.wave_complete is not None: + # Current wave is complete, move to next wave number + # and mark engines as paused. + if self.current_wave <= outputs.wave_complete: + self.current_wave = outputs.wave_complete + 1 + self.engines_running = False + + elif outputs.start_wave is not None and ( + outputs.start_wave > self.current_wave or + (outputs.start_wave == self.current_wave + and not self.engines_running)): + # Engine received request for a non-current wave so we must ensure + # that other engines progress to the next wave. + self.current_wave = outputs.start_wave + self.engines_running = True + await asyncio.gather(*self._start_wave_coros( + exclude_index=outputs.engine_index)) + + def _start_wave_coros(self, exclude_index: int) -> list[Awaitable[None]]: + logger.debug("Sending start DP wave %d.", self.current_wave) + return [ + self._send_input(EngineCoreRequestType.START_DP_WAVE, + self.current_wave, engine) + for engine in self.core_engines if engine.index != exclude_index + ] async def abort_requests_async(self, request_ids: list[str]) -> None: if not request_ids: From 7e081ba7cad2cf5c98376135de781fc76bfc103c Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 23 Apr 2025 10:17:48 +0800 Subject: [PATCH 07/81] [BugFix] Revert ROCm Custom Paged Attention Env Flag Check (#17022) Signed-off-by: vllmellm --- vllm/platforms/rocm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 24d8657af17d..944879b94ecd 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -118,6 +118,7 @@ def use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768 + and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN and envs.VLLM_ROCM_USE_AITER)) From 6bc1e30ef9ebaca4fe87c4406e60298b29d73ef9 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Wed, 23 Apr 2025 10:22:29 +0800 Subject: [PATCH 08/81] Revert "[Misc] Add S3 environment variables for better support of MinIO." 
(#17021) --- vllm/transformers_utils/s3_utils.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index f67b8fdd7c4e..1c3520bcfb27 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -45,12 +45,7 @@ def glob(s3=None, list[str]: List of full S3 paths allowed by the pattern """ if s3 is None: - s3 = boto3.client( - 's3', - aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), - aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), - endpoint_url=os.getenv("AWS_ENDPOINT_URL"), - region_name=os.getenv("AWS_REGION_NAME")) + s3 = boto3.client("s3") if not path.endswith("/"): path = path + "/" bucket_name, _, paths = list_files(s3, @@ -112,12 +107,7 @@ class S3Model: """ def __init__(self) -> None: - self.s3 = boto3.client( - 's3', - aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), - aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), - endpoint_url=os.getenv("AWS_ENDPOINT_URL"), - region_name=os.getenv("AWS_REGION_NAME")) + self.s3 = boto3.client('s3') for sig in (signal.SIGINT, signal.SIGTERM): existing_handler = signal.getsignal(sig) signal.signal(sig, self._close_by_signal(existing_handler)) From e1cf90e09971ad1fdb340c49e5b008b7852bc07d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 23 Apr 2025 10:59:48 +0800 Subject: [PATCH 09/81] [misc] tune some env vars for GB200 (#16992) Signed-off-by: youkaichao --- vllm/env_override.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/vllm/env_override.py b/vllm/env_override.py index 0fa5b70c2ef9..71f031d1e231 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -8,8 +8,21 @@ # that interact with vllm workers. # they are executed whenever `import vllm` is called. -# see https://github.com/NVIDIA/nccl/issues/1234 -os.environ['NCCL_CUMEM_ENABLE'] = '0' +if not os.path.exists('/dev/nvidia-caps-imex-channels'): + # normally, we disable NCCL_CUMEM_ENABLE because it + # will cost 1~2 GiB GPU memory with cudagraph+allreduce, + # see https://github.com/NVIDIA/nccl/issues/1234 + # for more details. + # However, NCCL requires NCCL_CUMEM_ENABLE to work with + # multi-node NVLink, typically on GB200-NVL72 systems. + # The ultimate way to detect multi-node NVLink is to use + # NVML APIs, which are too expensive to call here. + # As an approximation, we check the existence of + # /dev/nvidia-caps-imex-channels, used by + # multi-node NVLink to communicate across nodes. + # This will still cost some GPU memory, but it is worthwhile + # because we can get very fast cross-node bandwidth with NVLink. 
+ os.environ['NCCL_CUMEM_ENABLE'] = '0' # see https://github.com/vllm-project/vllm/pull/15951 # it avoids unintentional cuda initialization from torch.cuda.is_available() From 56a735261c17d04529bca08d8245b1d383bae5f0 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 22 Apr 2025 22:14:11 -0500 Subject: [PATCH 10/81] [INTEL-HPU][v0] Port delayed sampling to upstream (#16949) Signed-off-by: Michal Adamczyk Signed-off-by: Chendi Xue Co-authored-by: Michal Adamczyk --- vllm/envs.py | 7 ++ vllm/worker/hpu_model_runner.py | 140 ++++++++++++++++++++++++++++++-- 2 files changed, 140 insertions(+), 7 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 92dcf1555f22..03a8a2b20f02 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -98,6 +98,7 @@ VLLM_RAY_BUNDLE_INDICES: str = "" VLLM_CUDART_SO_PATH: Optional[str] = None VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH: bool = True + VLLM_HPU_USE_DELAYED_SAMPLING: bool = False VLLM_DP_RANK: int = 0 VLLM_DP_RANK_LOCAL: int = -1 VLLM_DP_SIZE: int = 1 @@ -650,6 +651,12 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: lambda: os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() in ("1", "true"), + # Use delayed sampling for HPU to reduce host cpu overhead + # between each step. + "VLLM_HPU_USE_DELAYED_SAMPLING": + lambda: os.environ.get("VLLM_DELAYED_SAMPLING", "false").lower() in + ("1", "true"), + # Rank of the process in the data parallel setting "VLLM_DP_RANK": lambda: int(os.getenv("VLLM_DP_RANK", "0")), diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 7a346b34cef5..2d31024b47d0 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -74,6 +74,8 @@ LORA_WARMUP_RANK = 8 +DUMMY_TOKEN_ID = -1 + class Singleton(type): _instances: Dict[type, object] = {} @@ -668,6 +670,9 @@ def __init__( # For multi-step scheduling self.cached_step_outputs: List[torch.Tensor] = [] + # For delayed sampling + self.cached_step_inputs: List[ + ModelInputForHPUWithSamplingMetadata] = [] def _set_gc_threshold(self) -> None: # Read https://docs.python.org/3/library/gc.html#gc.set_threshold @@ -771,6 +776,12 @@ def load_model(self) -> None: msg = f"Loading model weights took in total {m.get_summary_string()}" logger.info(msg) + def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): + return htorch.hpu.wrap_in_hpu_graph( + HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True + ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( + *args, **kwargs) + def get_model(self) -> nn.Module: return self.model @@ -2020,6 +2031,21 @@ def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int], return lora_mask, lora_logits_mask + def _get_seq_ids(self, model_input): + return ([ + sg.seq_ids[0] for sg in model_input.sampling_metadata.seq_groups + ]) + + def _pad_to_max_num_seqs(self, tensor, value): + padding_needed = self.max_num_seqs - tensor.size(0) + if padding_needed: + padding = torch.full((padding_needed, *tensor.shape[1:]), + value, + device=tensor.device, + dtype=tensor.dtype) + tensor = torch.cat([tensor, padding]) + return tensor + @torch.inference_mode() def execute_model( self, @@ -2030,6 +2056,37 @@ def execute_model( warmup_mode=False, seqs=None, ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + VLLM_DELAYED_SAMPLING = envs.VLLM_HPU_USE_DELAYED_SAMPLING + use_delayed_sampling = VLLM_DELAYED_SAMPLING and not warmup_mode + assert not (use_delayed_sampling and num_steps != 1), \ + 'Delayed sampling is not compatible with MSS!' 
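Delayed sampling defers the device-to-host copy of sampled token ids by one step: the current step returns a placeholder (DUMMY_TOKEN_ID) immediately, and the next step feeds the real device-side ids back in as inputs and back-fills the placeholder in the previously queued output. A toy sketch of that idea, assuming a simplified single-queue pipeline with made-up helper names (run_delayed_sampling and sample_on_device are illustrative, not vLLM APIs):

def run_delayed_sampling(num_steps, sample_on_device, dummy_id=-1):
    cached = []      # ids sampled in the previous step, still on device
    emitted = []     # outputs visible to the client, patched one step late
    for _ in range(num_steps):
        if cached:
            emitted[-1] = cached.pop(0)   # patch previous placeholder
        token_id = sample_on_device()     # no host sync in this step
        cached.append(token_id)
        emitted.append(dummy_id)          # placeholder returned right away
    if cached:
        emitted[-1] = cached.pop(0)       # flush the final real token
    return emitted

# e.g. run_delayed_sampling(3, lambda: 7) returns [7, 7, 7]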
+ assert model_input.input_tokens is not None + if use_delayed_sampling and not model_input.is_prompt and \ + self.is_driver_worker: + num_cached = len(self.cached_step_outputs) + assert num_cached > 0 + cur_seq_ids = self._get_seq_ids(model_input) + cur_seq_id_pos = { + sid: idx + for idx, sid in enumerate(cur_seq_ids) if sid >= 0 + } + htorch.core.mark_step() + for i in range(num_cached): + prev_seq_ids = self._get_seq_ids(self.cached_step_inputs[i]) + target_indices = [ + cur_seq_id_pos.get(psi, -1) for psi in prev_seq_ids + ] + padding = self.cached_step_outputs[i].size(0) - len( + target_indices) + target_indices.extend([-1] * padding) + target_indices = torch.tensor( + target_indices, + device=model_input.input_tokens.device, + dtype=model_input.input_tokens.dtype) + model_input.input_tokens.index_copy_( + 0, target_indices, self.cached_step_outputs[i]) + htorch.core.mark_step() + if not model_input.is_first_multi_step: if not model_input.is_last_step: # not first or last multi-step @@ -2045,7 +2102,21 @@ def execute_model( assert model_input.lora_mapping is not None self.set_active_loras(model_input.lora_requests, model_input.lora_mapping) - input_tokens = model_input.input_tokens + # Rank!=0 workers has is_prompt==None + if use_delayed_sampling and not model_input.is_prompt and \ + model_input.input_tokens.size(1) == 1: + if self.is_driver_worker: + model_kwargs_broadcast_data = { + "input_tokens": model_input.input_tokens + } + broadcast_tensor_dict(model_kwargs_broadcast_data, src=0) + input_tokens = model_input.input_tokens + + else: + model_kwargs_broadcast_data = broadcast_tensor_dict(src=0) + input_tokens = model_kwargs_broadcast_data["input_tokens"] + else: + input_tokens = model_input.input_tokens input_positions = model_input.input_positions attn_metadata = model_input.attn_metadata sampling_metadata = model_input.sampling_metadata @@ -2092,7 +2163,7 @@ def execute_model( f"graphs{'T' if use_graphs else 'F'}") else: model_event_name = 'model_executable' - if num_steps > 1: + if num_steps > 1 or use_delayed_sampling: # in case of multi-step scheduling # we only want to pythonize in the last step sampling_metadata.skip_sampler_cpu_output = True @@ -2152,9 +2223,9 @@ def try_revert_dummy_output_tokens(): if not self.is_driver_worker: continue - if model_input.async_callback is not None: - model_input.async_callback() - # Sample the next token. 
+ if use_delayed_sampling: + fake_output = self._delayed_sampler_outputs(model_input) + with self.profiler.record_event( 'internal', ('sample_' f'{"prompt" if is_prompt else "decode"}_' @@ -2166,9 +2237,16 @@ def try_revert_dummy_output_tokens(): ) if num_steps > 1: output = output.sampled_token_ids - self.cached_step_outputs.append( - output.detach().clone()) + self.cached_step_outputs.append(output) + if use_delayed_sampling and self.is_driver_worker: + self._patch_prev_output() + output = self._pad_to_max_num_seqs( + output.sampled_token_ids, DUMMY_TOKEN_ID) + self.cached_step_outputs.append(output) + self.cached_step_inputs.append(model_input) htorch.core.mark_step() + if model_input.async_callback is not None: + model_input.async_callback() if i < num_steps - 1: if i == 0: if model_input.async_callback is not None: @@ -2241,11 +2319,30 @@ def try_revert_dummy_output_tokens(): is_prompt=is_prompt) self.profiler.record_counter(self.event_start, counters) if num_steps == 1: + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + assert model_input.sampling_metadata is not None + if model_input.is_prompt: + output.prefill_hidden_states = hidden_states + output.hidden_states = hidden_states + if use_delayed_sampling: + if self.is_driver_worker: + return [fake_output] + else: + return [] + return [output] if self.is_driver_worker else [] else: return [] return output if type(output) is list else [output] + def _delayed_sampler_outputs(self, model_input): + next_token_ids = [[DUMMY_TOKEN_ID]] * len( + model_input.sampling_metadata.seq_groups) + sampler_output = self._make_decode_output( + next_token_ids, model_input.sampling_metadata.seq_groups) + return sampler_output + def _decode_sampler_outputs(self, model_input): use_async_out_proc = model_input.async_callback is not None sampler_outputs = [] @@ -2312,3 +2409,32 @@ def shutdown_inc(self): def __del__(self): self.shutdown_inc() + + def _patch_prev_output(self): + assert len(self.cached_step_inputs) == len(self.cached_step_outputs), \ + f'''Inputs and outputs are out of sync! + {len(self.cached_step_inputs)} vs {len(self.cached_step_outputs)}''' + if len(self.cached_step_inputs) == 0: + return + model_input = self.cached_step_inputs.pop(0) + delayed_output = self.cached_step_outputs.pop(0).cpu().squeeze( + -1).tolist() + ctx = model_input.async_callback.keywords["ctx"] # type: ignore + # If there's no output to patch with, which is usually the case when + # we're starting a new request after all requests are completed. + if len(ctx.output_queue) == 0: + return + assert len( + ctx.output_queue) == 1, 'There should be exactly 1 output waiting!' + output_data = ctx.output_queue[0] + assert len(output_data.outputs) == 1 + for fake_out, real_out in zip(output_data.outputs[0], delayed_output): + fake_out.samples[0].output_token = real_out + for sg, real_out in zip(output_data.seq_group_metadata_list, + delayed_output): + assert len(sg.seq_data) == 1 + seq_data = list(sg.seq_data.values())[0] + # This is a hack. 
Assigning output_token_ids triggers + # a cache recomputation and we only need to update the last token + seq_data.output_token_ids_array[-1] = real_out + seq_data._cached_all_token_ids[-1] = real_out From eb8ef4224da757bf8ea1c738f9514601835f8bbb Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Wed, 23 Apr 2025 12:06:30 +0800 Subject: [PATCH 11/81] [doc] add download path tips (#17013) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/source/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 331f18db817d..0fdffbeefd03 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -133,7 +133,7 @@ class MyConfig(PretrainedConfig): ### Hugging Face Hub -By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). +By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome). To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository. If the `"architectures"` field contains a model architecture listed below, then it should be natively supported. From 047797ef904fd77b13af3a972e92ef6de37a36db Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 23 Apr 2025 12:35:24 +0800 Subject: [PATCH 12/81] [Bugfix] Triton FA function takes no keyword arguments (#16902) Signed-off-by: vllmellm --- vllm/attention/backends/mla/common.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 2517a5971838..a3dec0dbda9f 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -1091,7 +1091,14 @@ def _flash_attn_varlen_diff_headdims(self, q, k, v, softmax_scale, q, k, maybe_padded_v, - **kwargs, + None, # output + kwargs["cu_seqlens_q"], + kwargs["cu_seqlens_k"], + kwargs["max_seqlen_q"], + kwargs["max_seqlen_k"], + kwargs["causal"], + softmax_scale, + None, # bias ) if is_vllm_fa: attn_out = self.flash_attn_varlen_func( From b2f195c4294ce72a97d4fdc6ca672ed85b916bd8 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 22 Apr 2025 21:36:29 -0700 Subject: [PATCH 13/81] [V1] Avoid socket errors during shutdown when requests are in in-flight (#16807) Signed-off-by: Nick Hill --- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/core_client.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2211431fbceb..572e052cdcc2 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -384,7 +384,7 @@ def signal_handler(signum, frame): except SystemExit: logger.debug("EngineCore exiting.") - + raise except Exception as e: if engine_core is None: logger.exception("EngineCore failed to start.") diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 0efb5dfb39b7..a2727d995e7d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -312,6 +312,7 @@ class BackgroundResources: def __call__(self): """Clean up background resources.""" + self.engine_dead = True for core_engine in self.core_engines: 
core_engine.close() @@ -564,7 +565,7 @@ def add_request(self, request: EngineCoreRequest) -> None: self._send_input(EngineCoreRequestType.ADD, request) def abort_requests(self, request_ids: list[str]) -> None: - if len(request_ids) > 0: + if request_ids and not self.resources.engine_dead: self._send_input(EngineCoreRequestType.ABORT, request_ids) def profile(self, is_start: bool = True) -> None: @@ -735,7 +736,7 @@ async def add_request_async(self, request: EngineCoreRequest) -> None: self._ensure_output_queue_task() async def abort_requests_async(self, request_ids: list[str]) -> None: - if len(request_ids) > 0: + if request_ids and not self.resources.engine_dead: await self._send_input(EngineCoreRequestType.ABORT, request_ids) async def profile_async(self, is_start: bool = True) -> None: @@ -902,5 +903,6 @@ async def abort_requests_async(self, request_ids: list[str]) -> None: async def _abort_requests(self, request_ids: list[str], engine: CoreEngine) -> None: - await self._send_input(EngineCoreRequestType.ABORT, request_ids, - engine) + if not self.resources.engine_dead: + await self._send_input(EngineCoreRequestType.ABORT, request_ids, + engine) From d0da99fb70ba68784575dc75a10ebf05afef0e5c Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 23 Apr 2025 00:49:24 -0400 Subject: [PATCH 14/81] [BugFix] llama4 fa3 fix - RuntimeError: scheduler_metadata must have shape (metadata_size) (#16998) Signed-off-by: Lucas Wilkinson --- vllm/v1/attention/backends/flash_attn.py | 76 +++++++++++++++--------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index dd6021468ac8..51ae386d3389 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -105,6 +105,7 @@ class LocalAttentionMetadata: local_block_table: torch.Tensor local_max_query_len: int local_max_seq_len: int + local_scheduler_metadata: Optional[torch.Tensor] local_attn_metadata: Optional[LocalAttentionMetadata] = None @@ -282,7 +283,9 @@ def __init__(self, runner: "GPUModelRunner"): self.runner = runner self.aot_schedule = (get_flash_attn_version() == 3) - self.num_heads = model_config.get_num_attention_heads( + self.num_heads_q = model_config.get_num_attention_heads( + runner.parallel_config) + self.num_heads_kv = model_config.get_num_kv_heads( runner.parallel_config) self.headdim = model_config.get_head_size() self.page_size = self.runner.block_size @@ -304,6 +307,23 @@ def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to( self.runner.device, non_blocking=True).long() + def schedule(batch_size, cu_query_lens, max_query_len, seqlens, + max_seq_len, causal): + if self.aot_schedule: + return get_scheduler_metadata( + batch_size=batch_size, + max_seqlen_q=max_query_len, + max_seqlen_k=max_seq_len, + cache_seqlens=seqlens, + num_heads_q=self.num_heads_q, + num_heads_kv=self.num_heads_kv, + headdim=self.headdim, + page_size=self.page_size, + cu_seqlens_q=cu_query_lens, + causal=causal, + ) + return None + # for local attention local_attn_metadata = None if self.runner.attention_chunk_size is not None: @@ -315,36 +335,31 @@ def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, block_table, self.runner.block_size, ) + local_query_start_loc = torch.from_numpy(virt_q_cu_seqlens_np).to( + self.runner.device, non_blocking=True) + local_seqused_k = torch.from_numpy(virt_k_seqlens_np).to( + 
self.runner.device, non_blocking=True) + local_max_query_len = seqlens_q_local_np.max() + local_max_seq_len = virt_k_seqlens_np.max() + local_scheduler_metadata = schedule( + batch_size=local_query_start_loc.shape[0] - 1, + cu_query_lens=local_query_start_loc, + max_query_len=local_max_query_len, + seqlens=local_seqused_k, + max_seq_len=local_max_seq_len, + causal=True) + local_attn_metadata = FlashAttentionMetadata.LocalAttentionMetadata( - local_query_start_loc=torch.from_numpy( - virt_q_cu_seqlens_np).to(self.runner.device, - non_blocking=True), - local_seqused_k=torch.from_numpy(virt_k_seqlens_np).to( - self.runner.device, non_blocking=True), + local_query_start_loc=local_query_start_loc, + local_seqused_k=local_seqused_k, local_block_table=virt_block_table, - local_max_query_len=seqlens_q_local_np.max(), - local_max_seq_len=virt_k_seqlens_np.max(), + local_max_query_len=local_max_query_len, + local_max_seq_len=local_max_seq_len, + local_scheduler_metadata=local_scheduler_metadata, ) use_cascade = common_prefix_len > 0 - def schedule(cu_query_lens, max_query_len, seqlens, max_seq_len, - causal): - if self.aot_schedule: - return get_scheduler_metadata( - batch_size=num_reqs, - max_seqlen_q=max_query_len, - max_seqlen_k=max_seq_len, - cache_seqlens=seqlens, - num_heads_q=self.num_heads, - num_heads_kv=self.num_heads, - headdim=self.headdim, - page_size=self.page_size, - cu_seqlens_q=cu_query_lens, - causal=causal, - ) - return None - if use_cascade: cu_prefix_query_lens = torch.tensor([0, num_actual_tokens], dtype=torch.int32, @@ -357,12 +372,14 @@ def schedule(cu_query_lens, max_query_len, seqlens, max_seq_len, suffix_kv_lens = torch.from_numpy(suffix_kv_lens).to( self.runner.device) prefix_scheduler_metadata = schedule( + batch_size=num_reqs, cu_query_lens=cu_prefix_query_lens, max_query_len=num_actual_tokens, seqlens=prefix_kv_lens, max_seq_len=common_prefix_len, causal=False) - scheduler_metadata = schedule(cu_query_lens=query_start_loc, + scheduler_metadata = schedule(batch_size=num_reqs, + cu_query_lens=query_start_loc, max_query_len=max_query_len, seqlens=suffix_kv_lens, max_seq_len=max_seq_len - @@ -373,7 +390,8 @@ def schedule(cu_query_lens, max_query_len, seqlens, max_seq_len, prefix_kv_lens = None suffix_kv_lens = None prefix_scheduler_metadata = None - scheduler_metadata = schedule(cu_query_lens=query_start_loc, + scheduler_metadata = schedule(batch_size=num_reqs, + cu_query_lens=query_start_loc, max_query_len=max_query_len, seqlens=seq_lens, max_seq_len=max_seq_len, @@ -540,12 +558,14 @@ def forward( max_seqlen_q = local_metadata.local_max_query_len max_seqlen_k = local_metadata.local_max_seq_len block_table = local_metadata.local_block_table + scheduler_metadata = local_metadata.local_scheduler_metadata else: cu_seqlens_q = attn_metadata.query_start_loc seqused_k = attn_metadata.seq_lens max_seqlen_q = attn_metadata.max_query_len max_seqlen_k = attn_metadata.max_seq_len block_table = attn_metadata.block_table + scheduler_metadata = attn_metadata.scheduler_metadata descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) @@ -564,7 +584,7 @@ def forward( window_size=self.sliding_window, block_table=block_table, softcap=self.logits_soft_cap, - scheduler_metadata=attn_metadata.scheduler_metadata, + scheduler_metadata=scheduler_metadata, fa_version=self.vllm_flash_attn_version, q_descale=layer._q_scale.expand(descale_shape), k_descale=layer._k_scale.expand(descale_shape), From ec69124eb4729840d0bfbfa4f84608d0f48ff9a5 Mon Sep 17 00:00:00 2001 From: huafeng Date: Wed, 23 Apr 2025 
14:16:53 +0800 Subject: [PATCH 15/81] [Misc] Improve readability of get_open_port function. (#17024) Signed-off-by: gitover22 --- vllm/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index c6e2afff72d7..c65a370bd530 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -628,12 +628,12 @@ def get_open_port() -> int: process. Currently it uses 2 ports. """ if "VLLM_DP_MASTER_PORT" in os.environ: - dp_port = envs.VLLM_DP_MASTER_PORT + dp_master_port = envs.VLLM_DP_MASTER_PORT + reserved_port_range = range(dp_master_port, dp_master_port + 10) while True: - port = _get_open_port() - if dp_port <= port < dp_port + 10: - continue - return port + candidate_port = _get_open_port() + if candidate_port not in reserved_port_range: + return candidate_port return _get_open_port() From 8c87a9ad46dd8b972d4cd9c6cecb5b284c92f583 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Wed, 23 Apr 2025 15:24:09 +0800 Subject: [PATCH 16/81] [Bugfix] Fix AssertionError: skip_special_tokens=False is not supported for Mistral tokenizers (#16964) Signed-off-by: chaunceyjiang --- .../openai/tool_parsers/mistral_tool_parser.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index bff6cb79ad53..f0000daa0a41 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -72,10 +72,14 @@ def __init__(self, tokenizer: AnyTokenizer): def adjust_request( self, request: ChatCompletionRequest) -> ChatCompletionRequest: - if request.tools and request.tool_choice != 'none': - # do not skip special tokens because mistral uses the special - # tokens to indicate the start and end of the tool calls - # information. + if not isinstance( + self.model_tokenizer, MistralTokenizer + ) and request.tools and request.tool_choice != 'none': + # Do not skip special tokens when using chat template + # with Mistral parser as TOOL_CALL token is needed + # for tool detection. 
+ # Note: we don't want skip_special_tokens=False + # with MistralTokenizer as it is incompatible request.skip_special_tokens = False return request From ce17db80851e56c053316cebddee923b440b4d8a Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 23 Apr 2025 04:13:34 -0400 Subject: [PATCH 17/81] [CI] Run v1/test_serial_utils.py in CI (#16996) Signed-off-by: Russell Bryant --- .buildkite/test-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 95e38305e1e1..2420b2d5d71b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -209,6 +209,7 @@ steps: - pytest -v -s v1/worker - pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode + - pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_stats.py - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py From aa72d9a4ea6b31a845bf4fbd5a97d3175a8c329a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 23 Apr 2025 06:46:23 -0600 Subject: [PATCH 18/81] Mistral-format support for compressed-tensors (#16803) Signed-off-by: mgoin --- vllm/transformers_utils/config.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 178fdd63a872..4e2a31ce6729 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -690,6 +690,9 @@ def recurse_elems(elem: Any): "quant_method": "fp8", "activation_scheme": "static" } + elif quantization.get("quant_method") == "compressed-tensors": + # Pass through the quantization config to compressed-tensors + quantization_config = quantization else: raise ValueError( f"Found unknown quantization='{quantization}' in config") @@ -707,6 +710,7 @@ def recurse_elems(elem: Any): if config_type == "multimodal": multimodal_config = config_dict.pop("vision_encoder") + quantization_config = config_dict.get("quantization_config", {}) config_dict = { "text_config": config_dict, @@ -714,6 +718,8 @@ def recurse_elems(elem: Any): } config_dict["architectures"] = ["PixtralForConditionalGeneration"] config_dict["model_type"] = "pixtral" + if quantization_config: + config_dict["quantization_config"] = quantization_config config_dict.update(kwargs) From 6317a5174a4f3cbd57c44d15023042cecc576f9e Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 23 Apr 2025 07:21:07 -0600 Subject: [PATCH 19/81] Categorize `tests/kernels/` based on kernel type (#16799) Signed-off-by: mgoin --- .../test_lm_eval_correctness.py | 2 +- .buildkite/test-pipeline.yaml | 41 ++++++++++++++++--- tests/kernels/{ => attention}/conftest.py | 0 .../kernels/{ => attention}/test_attention.py | 3 +- .../test_attention_selector.py | 9 ++++ .../test_blocksparse_attention.py | 3 +- tests/kernels/{ => attention}/test_cache.py | 0 .../test_cascade_flash_attn.py | 0 .../test_encoder_decoder_attn.py | 0 .../{ => attention}/test_flash_attn.py | 0 .../{ => attention}/test_flashinfer.py | 0 .../kernels/{ => attention}/test_flashmla.py | 0 .../{ => attention}/test_lightning_attn.py | 0 .../{ => attention}/test_merge_attn_states.py | 0 .../kernels/{ => attention}/test_mha_attn.py | 0 .../{ => attention}/test_mla_decode_cpu.py | 0 .../{ => attention}/test_prefix_prefill.py | 0 .../test_rocm_attention_selector.py | 0 .../test_triton_decode_attention.py | 0 tests/kernels/{ => core}/test_activation.py | 3 +- .../{ => core}/test_fused_quant_layernorm.py | 0 tests/kernels/{ => core}/test_layernorm.py | 0 tests/kernels/core/test_opcheck.py | 25 +++++++++++ 
tests/kernels/{ => core}/test_permute_cols.py | 0 tests/kernels/{ => core}/test_pos_encoding.py | 3 +- .../{ => core}/test_rotary_embedding.py | 0 tests/kernels/{ => core}/test_uva.py | 0 .../kernels/{ => mamba}/test_causal_conv1d.py | 0 .../kernels/{ => mamba}/test_mamba_mixer2.py | 0 tests/kernels/{ => mamba}/test_mamba_ssm.py | 0 .../kernels/{ => mamba}/test_mamba_ssm_ssd.py | 0 tests/kernels/{ => moe}/test_cutlass_moe.py | 0 tests/kernels/{ => moe}/test_moe.py | 0 .../{ => moe}/test_triton_moe_ptpc_fp8.py | 0 .../{ => quantization}/test_allspark_gemm.py | 0 tests/kernels/{ => quantization}/test_aqlm.py | 0 tests/kernels/{ => quantization}/test_awq.py | 0 .../{ => quantization}/test_awq_marlin.py | 0 .../{ => quantization}/test_awq_triton.py | 0 .../{ => quantization}/test_block_fp8.py | 3 +- .../{ => quantization}/test_block_int8.py | 3 +- .../test_cutlass_2of4_sparse.py | 3 +- .../test_cutlass_scaled_mm.py} | 4 +- .../{ => quantization}/test_fp8_quant.py | 0 tests/kernels/{ => quantization}/test_ggml.py | 0 tests/kernels/{ => quantization}/test_gguf.py | 0 tests/kernels/{ => quantization}/test_gptq.py | 0 .../{ => quantization}/test_int8_kernel.py | 0 .../{ => quantization}/test_int8_quant.py | 0 .../{ => quantization}/test_machete_mm.py | 0 .../{ => quantization}/test_marlin_gemm.py | 0 .../{ => quantization}/test_nvfp4_quant.py | 0 .../test_nvfp4_scaled_mm.py | 0 .../test_triton_scaled_mm.py | 0 tests/kernels/test_utils.py | 25 ----------- 55 files changed, 79 insertions(+), 48 deletions(-) rename tests/kernels/{ => attention}/conftest.py (100%) rename tests/kernels/{ => attention}/test_attention.py (99%) rename tests/kernels/{ => attention}/test_attention_selector.py (95%) rename tests/kernels/{ => attention}/test_blocksparse_attention.py (99%) rename tests/kernels/{ => attention}/test_cache.py (100%) rename tests/kernels/{ => attention}/test_cascade_flash_attn.py (100%) rename tests/kernels/{ => attention}/test_encoder_decoder_attn.py (100%) rename tests/kernels/{ => attention}/test_flash_attn.py (100%) rename tests/kernels/{ => attention}/test_flashinfer.py (100%) rename tests/kernels/{ => attention}/test_flashmla.py (100%) rename tests/kernels/{ => attention}/test_lightning_attn.py (100%) rename tests/kernels/{ => attention}/test_merge_attn_states.py (100%) rename tests/kernels/{ => attention}/test_mha_attn.py (100%) rename tests/kernels/{ => attention}/test_mla_decode_cpu.py (100%) rename tests/kernels/{ => attention}/test_prefix_prefill.py (100%) rename tests/kernels/{ => attention}/test_rocm_attention_selector.py (100%) rename tests/kernels/{ => attention}/test_triton_decode_attention.py (100%) rename tests/kernels/{ => core}/test_activation.py (97%) rename tests/kernels/{ => core}/test_fused_quant_layernorm.py (100%) rename tests/kernels/{ => core}/test_layernorm.py (100%) create mode 100644 tests/kernels/core/test_opcheck.py rename tests/kernels/{ => core}/test_permute_cols.py (100%) rename tests/kernels/{ => core}/test_pos_encoding.py (99%) rename tests/kernels/{ => core}/test_rotary_embedding.py (100%) rename tests/kernels/{ => core}/test_uva.py (100%) rename tests/kernels/{ => mamba}/test_causal_conv1d.py (100%) rename tests/kernels/{ => mamba}/test_mamba_mixer2.py (100%) rename tests/kernels/{ => mamba}/test_mamba_ssm.py (100%) rename tests/kernels/{ => mamba}/test_mamba_ssm_ssd.py (100%) rename tests/kernels/{ => moe}/test_cutlass_moe.py (100%) rename tests/kernels/{ => moe}/test_moe.py (100%) rename tests/kernels/{ => moe}/test_triton_moe_ptpc_fp8.py (100%) rename 
tests/kernels/{ => quantization}/test_allspark_gemm.py (100%) rename tests/kernels/{ => quantization}/test_aqlm.py (100%) rename tests/kernels/{ => quantization}/test_awq.py (100%) rename tests/kernels/{ => quantization}/test_awq_marlin.py (100%) rename tests/kernels/{ => quantization}/test_awq_triton.py (100%) rename tests/kernels/{ => quantization}/test_block_fp8.py (99%) rename tests/kernels/{ => quantization}/test_block_int8.py (99%) rename tests/kernels/{ => quantization}/test_cutlass_2of4_sparse.py (99%) rename tests/kernels/{test_cutlass.py => quantization/test_cutlass_scaled_mm.py} (99%) rename tests/kernels/{ => quantization}/test_fp8_quant.py (100%) rename tests/kernels/{ => quantization}/test_ggml.py (100%) rename tests/kernels/{ => quantization}/test_gguf.py (100%) rename tests/kernels/{ => quantization}/test_gptq.py (100%) rename tests/kernels/{ => quantization}/test_int8_kernel.py (100%) rename tests/kernels/{ => quantization}/test_int8_quant.py (100%) rename tests/kernels/{ => quantization}/test_machete_mm.py (100%) rename tests/kernels/{ => quantization}/test_marlin_gemm.py (100%) rename tests/kernels/{ => quantization}/test_nvfp4_quant.py (100%) rename tests/kernels/{ => quantization}/test_nvfp4_scaled_mm.py (100%) rename tests/kernels/{ => quantization}/test_triton_scaled_mm.py (100%) delete mode 100644 tests/kernels/test_utils.py diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 4ae23eff62f3..6015a83e8295 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -16,7 +16,7 @@ import pytest import yaml -RTOL = 0.05 +RTOL = 0.08 TEST_DATA_FILE = os.environ.get( "LM_EVAL_TEST_DATA_FILE", ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2420b2d5d71b..ec00bc7f108d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -317,15 +317,46 @@ steps: commands: - pytest -v -s compile/test_full_graph.py -- label: Kernels Test %N # 1h each - mirror_hardwares: [amd] +- label: Kernels Core Operation Test source_file_dependencies: - csrc/ + - tests/kernels/core + commands: + - pytest -v -s kernels/core + +- label: Kernels Attention Test %N + source_file_dependencies: + - csrc/attention/ - vllm/attention - - tests/kernels + - vllm/v1/attention + - tests/kernels/attention commands: - - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 4 + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test + source_file_dependencies: + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + commands: + - pytest -v -s kernels/moe + +- label: Kernels Mamba Test + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + commands: + - pytest -v -s kernels/mamba - label: Tensorizer Test # 11min # mirror_hardwares: [amd] diff --git a/tests/kernels/conftest.py b/tests/kernels/attention/conftest.py similarity index 100% rename 
from tests/kernels/conftest.py rename to tests/kernels/attention/conftest.py diff --git a/tests/kernels/test_attention.py b/tests/kernels/attention/test_attention.py similarity index 99% rename from tests/kernels/test_attention.py rename to tests/kernels/attention/test_attention.py index 0d7898a900e4..e5650136f258 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -6,13 +6,12 @@ import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes -from .allclose_default import get_default_atol, get_default_rtol - if not current_platform.is_rocm(): from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py similarity index 95% rename from tests/kernels/test_attention_selector.py rename to tests/kernels/attention/test_attention_selector.py index 2b5e0a29ddc5..b0414244c215 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -156,6 +156,15 @@ def test_env( expected = ("TRITON_MLA_VLLM_V1" if use_v1 else "TRITON_MLA") assert backend.get_name() == expected + elif name == "FLASHINFER": + backend = get_attn_backend(16, + torch.float16, + torch.float16, + block_size, + False, + use_mla=use_mla) + expected = "FLASHINFER_VLLM_V1" if use_v1 else name + assert backend.get_name() == expected else: backend = get_attn_backend(16, torch.float16, diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/attention/test_blocksparse_attention.py similarity index 99% rename from tests/kernels/test_blocksparse_attention.py rename to tests/kernels/attention/test_blocksparse_attention.py index 3025ae0f921a..82d038257575 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/attention/test_blocksparse_attention.py @@ -6,14 +6,13 @@ import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) from vllm.platforms import current_platform from vllm.utils import get_max_shared_memory_bytes -from .allclose_default import get_default_atol, get_default_rtol - FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
# - 512 as a buffer diff --git a/tests/kernels/test_cache.py b/tests/kernels/attention/test_cache.py similarity index 100% rename from tests/kernels/test_cache.py rename to tests/kernels/attention/test_cache.py diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py similarity index 100% rename from tests/kernels/test_cascade_flash_attn.py rename to tests/kernels/attention/test_cascade_flash_attn.py diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/attention/test_encoder_decoder_attn.py similarity index 100% rename from tests/kernels/test_encoder_decoder_attn.py rename to tests/kernels/attention/test_encoder_decoder_attn.py diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py similarity index 100% rename from tests/kernels/test_flash_attn.py rename to tests/kernels/attention/test_flash_attn.py diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py similarity index 100% rename from tests/kernels/test_flashinfer.py rename to tests/kernels/attention/test_flashinfer.py diff --git a/tests/kernels/test_flashmla.py b/tests/kernels/attention/test_flashmla.py similarity index 100% rename from tests/kernels/test_flashmla.py rename to tests/kernels/attention/test_flashmla.py diff --git a/tests/kernels/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py similarity index 100% rename from tests/kernels/test_lightning_attn.py rename to tests/kernels/attention/test_lightning_attn.py diff --git a/tests/kernels/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py similarity index 100% rename from tests/kernels/test_merge_attn_states.py rename to tests/kernels/attention/test_merge_attn_states.py diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py similarity index 100% rename from tests/kernels/test_mha_attn.py rename to tests/kernels/attention/test_mha_attn.py diff --git a/tests/kernels/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py similarity index 100% rename from tests/kernels/test_mla_decode_cpu.py rename to tests/kernels/attention/test_mla_decode_cpu.py diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py similarity index 100% rename from tests/kernels/test_prefix_prefill.py rename to tests/kernels/attention/test_prefix_prefill.py diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py similarity index 100% rename from tests/kernels/test_rocm_attention_selector.py rename to tests/kernels/attention/test_rocm_attention_selector.py diff --git a/tests/kernels/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py similarity index 100% rename from tests/kernels/test_triton_decode_attention.py rename to tests/kernels/attention/test_triton_decode_attention.py diff --git a/tests/kernels/test_activation.py b/tests/kernels/core/test_activation.py similarity index 97% rename from tests/kernels/test_activation.py rename to tests/kernels/core/test_activation.py index cf0f21ce0651..79f838a954e7 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, 
MulAndSilu, @@ -12,8 +13,6 @@ SiluAndMul) from vllm.platforms import current_platform -from .allclose_default import get_default_atol, get_default_rtol - DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 13824] # Arbitrary values for testing diff --git a/tests/kernels/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py similarity index 100% rename from tests/kernels/test_fused_quant_layernorm.py rename to tests/kernels/core/test_fused_quant_layernorm.py diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/core/test_layernorm.py similarity index 100% rename from tests/kernels/test_layernorm.py rename to tests/kernels/core/test_layernorm.py diff --git a/tests/kernels/core/test_opcheck.py b/tests/kernels/core/test_opcheck.py new file mode 100644 index 000000000000..c9a9679c5d80 --- /dev/null +++ b/tests/kernels/core/test_opcheck.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Tests for miscellaneous utilities +""" + +import torch + +from tests.kernels.utils import opcheck + + +def test_convert_fp8_opcheck(): + data = torch.randn((256, 256), dtype=torch.float32, device="cuda") + result = torch.empty_like(data, dtype=torch.float8_e4m3fn) + opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) + + +# TODO: Add this back, currently fails with +# csrc/cuda_utils_kernels.cu:15 'invalid argument' +# @pytest.mark.skipif(not current_platform.is_cuda(), +# reason="Only supported for CUDA") +# def test_cuda_utils_opcheck(): +# opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) +# opcheck( +# torch.ops._C_cuda_utils. +# get_max_shared_memory_per_block_device_attribute, (0, )) diff --git a/tests/kernels/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py similarity index 100% rename from tests/kernels/test_permute_cols.py rename to tests/kernels/core/test_permute_cols.py diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py similarity index 99% rename from tests/kernels/test_pos_encoding.py rename to tests/kernels/core/test_pos_encoding.py index eb83b4d612c2..2b7bf755ec22 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -6,11 +6,10 @@ import pytest import torch +from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform -from .allclose_default import get_default_atol, get_default_rtol - IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] HEAD_SIZES = [64, 80, 112, 120, 256] diff --git a/tests/kernels/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py similarity index 100% rename from tests/kernels/test_rotary_embedding.py rename to tests/kernels/core/test_rotary_embedding.py diff --git a/tests/kernels/test_uva.py b/tests/kernels/core/test_uva.py similarity index 100% rename from tests/kernels/test_uva.py rename to tests/kernels/core/test_uva.py diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py similarity index 100% rename from tests/kernels/test_causal_conv1d.py rename to tests/kernels/mamba/test_causal_conv1d.py diff --git a/tests/kernels/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py similarity index 100% rename from tests/kernels/test_mamba_mixer2.py rename to tests/kernels/mamba/test_mamba_mixer2.py diff --git 
a/tests/kernels/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py similarity index 100% rename from tests/kernels/test_mamba_ssm.py rename to tests/kernels/mamba/test_mamba_ssm.py diff --git a/tests/kernels/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py similarity index 100% rename from tests/kernels/test_mamba_ssm_ssd.py rename to tests/kernels/mamba/test_mamba_ssm_ssd.py diff --git a/tests/kernels/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py similarity index 100% rename from tests/kernels/test_cutlass_moe.py rename to tests/kernels/moe/test_cutlass_moe.py diff --git a/tests/kernels/test_moe.py b/tests/kernels/moe/test_moe.py similarity index 100% rename from tests/kernels/test_moe.py rename to tests/kernels/moe/test_moe.py diff --git a/tests/kernels/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py similarity index 100% rename from tests/kernels/test_triton_moe_ptpc_fp8.py rename to tests/kernels/moe/test_triton_moe_ptpc_fp8.py diff --git a/tests/kernels/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py similarity index 100% rename from tests/kernels/test_allspark_gemm.py rename to tests/kernels/quantization/test_allspark_gemm.py diff --git a/tests/kernels/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py similarity index 100% rename from tests/kernels/test_aqlm.py rename to tests/kernels/quantization/test_aqlm.py diff --git a/tests/kernels/test_awq.py b/tests/kernels/quantization/test_awq.py similarity index 100% rename from tests/kernels/test_awq.py rename to tests/kernels/quantization/test_awq.py diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/quantization/test_awq_marlin.py similarity index 100% rename from tests/kernels/test_awq_marlin.py rename to tests/kernels/quantization/test_awq_marlin.py diff --git a/tests/kernels/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py similarity index 100% rename from tests/kernels/test_awq_triton.py rename to tests/kernels/quantization/test_awq_triton.py diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py similarity index 99% rename from tests/kernels/test_block_fp8.py rename to tests/kernels/quantization/test_block_fp8.py index c450048bf665..da594675e924 100644 --- a/tests/kernels/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -6,6 +6,7 @@ import pytest import torch +from tests.kernels.utils_block import native_w8a8_block_matmul from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe @@ -18,8 +19,6 @@ per_token_group_quant_fp8, w8a8_block_fp8_matmul) from vllm.platforms import current_platform -from .utils_block import native_w8a8_block_matmul - dg_available = False try: import deep_gemm diff --git a/tests/kernels/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py similarity index 99% rename from tests/kernels/test_block_int8.py rename to tests/kernels/quantization/test_block_int8.py index 9447f9d69165..943470ad113d 100644 --- a/tests/kernels/test_block_int8.py +++ b/tests/kernels/quantization/test_block_int8.py @@ -6,6 +6,7 @@ import pytest import torch +from tests.kernels.utils_block import native_w8a8_block_matmul from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe @@ -13,8 +14,6 @@ 
w8a8_block_int8_matmul) from vllm.platforms import current_platform -from .utils_block import native_w8a8_block_matmul - if current_platform.get_device_capability() < (7, 0): pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py similarity index 99% rename from tests/kernels/test_cutlass_2of4_sparse.py rename to tests/kernels/quantization/test_cutlass_2of4_sparse.py index 2890e15d6cba..d67d2dbb8998 100644 --- a/tests/kernels/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -7,13 +7,12 @@ import pytest import torch +from tests.kernels.utils import baseline_scaled_mm, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( sparse_cutlass_supported) from vllm.platforms import current_platform -from .utils import baseline_scaled_mm, to_fp8, to_int8 - CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py similarity index 99% rename from tests/kernels/test_cutlass.py rename to tests/kernels/quantization/test_cutlass_scaled_mm.py index f11ce6f45a98..8084d9bf2c2d 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -8,13 +8,11 @@ import pytest import torch -from tests.kernels.utils import opcheck +from tests.kernels.utils import baseline_scaled_mm, opcheck, to_fp8, to_int8 from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils import cdiv -from .utils import baseline_scaled_mm, to_fp8, to_int8 - MNK_FACTORS = [ (1, 256, 128), (1, 16384, 1024), diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py similarity index 100% rename from tests/kernels/test_fp8_quant.py rename to tests/kernels/quantization/test_fp8_quant.py diff --git a/tests/kernels/test_ggml.py b/tests/kernels/quantization/test_ggml.py similarity index 100% rename from tests/kernels/test_ggml.py rename to tests/kernels/quantization/test_ggml.py diff --git a/tests/kernels/test_gguf.py b/tests/kernels/quantization/test_gguf.py similarity index 100% rename from tests/kernels/test_gguf.py rename to tests/kernels/quantization/test_gguf.py diff --git a/tests/kernels/test_gptq.py b/tests/kernels/quantization/test_gptq.py similarity index 100% rename from tests/kernels/test_gptq.py rename to tests/kernels/quantization/test_gptq.py diff --git a/tests/kernels/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py similarity index 100% rename from tests/kernels/test_int8_kernel.py rename to tests/kernels/quantization/test_int8_kernel.py diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py similarity index 100% rename from tests/kernels/test_int8_quant.py rename to tests/kernels/quantization/test_int8_quant.py diff --git a/tests/kernels/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py similarity index 100% rename from tests/kernels/test_machete_mm.py rename to tests/kernels/quantization/test_machete_mm.py diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py similarity index 100% rename from tests/kernels/test_marlin_gemm.py rename to tests/kernels/quantization/test_marlin_gemm.py diff --git a/tests/kernels/test_nvfp4_quant.py 
b/tests/kernels/quantization/test_nvfp4_quant.py similarity index 100% rename from tests/kernels/test_nvfp4_quant.py rename to tests/kernels/quantization/test_nvfp4_quant.py diff --git a/tests/kernels/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py similarity index 100% rename from tests/kernels/test_nvfp4_scaled_mm.py rename to tests/kernels/quantization/test_nvfp4_scaled_mm.py diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py similarity index 100% rename from tests/kernels/test_triton_scaled_mm.py rename to tests/kernels/quantization/test_triton_scaled_mm.py diff --git a/tests/kernels/test_utils.py b/tests/kernels/test_utils.py deleted file mode 100644 index d3f032002651..000000000000 --- a/tests/kernels/test_utils.py +++ /dev/null @@ -1,25 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -Tests for miscellaneous utilities -""" - -import pytest -import torch - -from tests.kernels.utils import opcheck -from vllm.platforms import current_platform - - -def test_convert_fp8_opcheck(): - data = torch.randn((256, 256), dtype=torch.float32, device="cuda") - result = torch.empty_like(data, dtype=torch.float8_e4m3fn) - opcheck(torch.ops._C_cache_ops.convert_fp8, (result, data, 1.0, "fp8")) - - -@pytest.mark.skipif(not current_platform.is_cuda(), - reason="Only supported for CUDA") -def test_cuda_utils_opcheck(): - opcheck(torch.ops._C_cuda_utils.get_device_attribute, (0, 0)) - opcheck( - torch.ops._C_cuda_utils. - get_max_shared_memory_per_block_device_attribute, (0, )) From f7912cba3d613afa8b96ce2e04dad671205050c6 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Wed, 23 Apr 2025 22:32:16 +0800 Subject: [PATCH 20/81] [Doc] Add top anchor and a note to quantization/bitblas.md (#17042) Signed-off-by: windsonsea --- docs/source/features/quantization/bitblas.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/features/quantization/bitblas.md b/docs/source/features/quantization/bitblas.md index aff917f90ec2..2901f760d3e4 100644 --- a/docs/source/features/quantization/bitblas.md +++ b/docs/source/features/quantization/bitblas.md @@ -1,7 +1,15 @@ +(bitblas)= + # BitBLAS vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more efficient and flexible model inference. Compared to other quantization frameworks, BitBLAS provides more precision combinations. +:::{note} +Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). +Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. +For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html). +::: + Below are the steps to utilize BitBLAS with vLLM. 
```console From 53c0fa1e25a5d406d3011e760be068d7a8c102fb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 23 Apr 2025 15:32:26 +0100 Subject: [PATCH 21/81] Ensure that `pid` passed to `kill_process_tree` is `int` for `mypy` (#17051) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/v1/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 32d8101f681d..9c0fa2d0773d 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -134,8 +134,8 @@ def shutdown(proc: Process, input_path: str, output_path: str): proc.terminate() proc.join(5) - if proc.is_alive(): - kill_process_tree(proc.pid) + if proc.is_alive() and (pid := proc.pid) is not None: + kill_process_tree(pid) # Remove zmq ipc socket files. ipc_sockets = [output_path, input_path] From af869f6dffec0cd99394bfc7449e28a67d2e9a5d Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 23 Apr 2025 10:33:14 -0400 Subject: [PATCH 22/81] [CI] Update structured-output label automation (#17055) Signed-off-by: Russell Bryant --- .github/mergify.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/mergify.yml b/.github/mergify.yml index 3097b994659a..2033722b5f33 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -55,11 +55,19 @@ pull_request_rules: description: Automatically apply structured-output label conditions: - or: + - files~=^benchmarks/structured_schemas/ + - files=benchmarks/benchmark_serving_structured_output.py + - files=benchmarks/run_structured_output_benchmark.sh + - files=docs/source/features/structured_outputs.md + - files=examples/offline_inference/structured_outputs.py + - files=examples/online_serving/openai_chat_completion_structured_outputs.py + - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files~=^vllm/model_executor/guided_decoding/ - files=tests/model_executor/test_guided_processors.py - files=tests/entrypoints/llm/test_guided_generate.py - - files=benchmarks/benchmark_serving_guided.py - - files=benchmarks/benchmark_guided.py + - files~=^tests/v1/structured_output/ + - files=tests/v1/entrypoints/llm/test_guided_generate.py + - files~=^vllm/v1/structured_output/ actions: label: add: From 8e630d680e70fe7c7a1022ed33a09e6efaad629f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 23 Apr 2025 15:33:51 +0100 Subject: [PATCH 23/81] Improve Transformers backend model loading QoL (#17039) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/model_loader/utils.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 15f37aad6d8c..af4f2e95a9af 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -55,7 +55,10 @@ def resolve_transformers_arch(model_config: ModelConfig, # "AutoModelFor": "--", # }, auto_modules = { - name: get_class_from_dynamic_module(module, model_config.model) + name: + get_class_from_dynamic_module(module, + model_config.model, + revision=model_config.revision) for name, module in sorted(auto_map.items(), key=lambda x: x[0]) } custom_model_module = auto_modules.get("AutoModel") @@ -97,10 +100,10 @@ def get_model_architecture( architectures = ["QuantMixtralForCausalLM"] vllm_supported_archs = 
ModelRegistry.get_supported_archs() - is_vllm_supported = any(arch in vllm_supported_archs - for arch in architectures) - if (not is_vllm_supported - or model_config.model_impl == ModelImpl.TRANSFORMERS): + vllm_not_supported = not any(arch in vllm_supported_archs + for arch in architectures) + if (model_config.model_impl == ModelImpl.TRANSFORMERS or + model_config.model_impl != ModelImpl.VLLM and vllm_not_supported): architectures = resolve_transformers_arch(model_config, architectures) model_cls, arch = ModelRegistry.resolve_model_cls(architectures) From f3a21e9c6835f04ecb5da1b953cb650986bf2209 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 23 Apr 2025 16:50:05 +0100 Subject: [PATCH 24/81] `CacheConfig.block_size` should always be `int` when used (#17052) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 641b221f5d34..f403654be165 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1261,11 +1261,14 @@ def is_matryoshka(self) -> bool: class CacheConfig: """Configuration for the KV cache.""" - block_size: Optional[BlockSize] = None + block_size: BlockSize = None # type: ignore """Size of a contiguous cache block in number of tokens. This is ignored on neuron devices and set to `--max-model-len`. On CUDA devices, only block sizes up to 32 are supported. On HPU devices, block size defaults to 128. - """ + + This config has no static default. If left unspecified by the user, it will + be set in `Platform.check_and_update_configs()` based on the current + platform.""" gpu_memory_utilization: float = 0.9 """The fraction of GPU memory to be used for the model executor, which can range from 0 to 1. 
For example, a value of 0.5 would imply 50% GPU memory From bdb366031252cd8d99270a3f81dfe76021a760c1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 23 Apr 2025 16:50:08 +0100 Subject: [PATCH 25/81] Use `@property` and private field for `data_parallel_rank_local` (#17053) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 17 +++++++++++++++-- vllm/v1/engine/core_client.py | 6 +++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index f403654be165..709983d05a50 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1593,8 +1593,21 @@ class ParallelConfig: the product of the tensor parallel size and data parallel size.""" data_parallel_rank: int = 0 """Rank of the data parallel group.""" - data_parallel_rank_local: Optional[int] = None - """Local rank of the data parallel group, defaults to global rank.""" + _data_parallel_rank_local: Optional[int] = field(default=None, init=False) + """Private field to store the local rank of the data parallel group.""" + + @property + def data_parallel_rank_local(self) -> int: + """Local rank of the data parallel group, defaults to global rank.""" + if self._data_parallel_rank_local is None: + return self.data_parallel_rank + return self._data_parallel_rank_local + + @data_parallel_rank_local.setter + def data_parallel_rank_local(self, value: int) -> None: + """Set the local rank of the data parallel group.""" + self._data_parallel_rank_local = value + data_parallel_master_ip: str = "127.0.0.1" """IP of the data parallel master.""" data_parallel_master_port: int = 29500 diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index a2727d995e7d..21ad446172a3 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -439,10 +439,10 @@ def _init_core_engines( ) -> None: # Default case - single core engine. 
- dp_rank = vllm_config.parallel_config.data_parallel_rank - local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local core_engine = new_core_engine( - dp_rank, local_dp_rank if local_dp_rank is not None else dp_rank) + vllm_config.parallel_config.data_parallel_rank, + vllm_config.parallel_config.data_parallel_rank_local, + ) core_engines.append(core_engine) self.core_engine = core_engine From 3cde34a4a4bceb511f2f9fe2dade1da116eb7e8a Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Wed, 23 Apr 2025 12:34:41 -0600 Subject: [PATCH 26/81] [Frontend] Support guidance:no-additional-properties for compatibility with xgrammar (#15949) Signed-off-by: Travis Johnson --- tests/entrypoints/llm/test_guided_generate.py | 60 ++++++++++++++++++- .../llm/test_struct_output_generate.py | 56 +++++++++++++++++ vllm/config.py | 2 +- vllm/engine/arg_utils.py | 18 +++--- .../guided_decoding/guidance_decoding.py | 15 ++++- vllm/v1/engine/processor.py | 8 --- vllm/v1/structured_output/backend_guidance.py | 60 ++++++++++++++++--- vllm/v1/structured_output/backend_xgrammar.py | 16 +++-- 8 files changed, 201 insertions(+), 34 deletions(-) diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index e43e9826e8f9..6bc32ebadc7a 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -383,4 +383,62 @@ def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str): assert generated_text is not None print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") output_json = json.loads(generated_text) - jsonschema.validate(instance=output_json, schema=json_schema) \ No newline at end of file + jsonschema.validate(instance=output_json, schema=json_schema) + + +@pytest.mark.skip_global_cleanup +def test_guidance_no_additional_properties(llm): + schema = { + 'type': 'object', + 'properties': { + 'a1': { + 'type': 'string' + }, + 'a2': { + 'type': 'string' + }, + 'a3': { + 'type': 'string' + } + }, + 'required': ['a1', 'a2', 'a3'], + } + + prompt = ( + "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a " + "helpful assistant.<|im_end|>\n<|im_start|>user\nPlease generate a " + "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20" + "<|im_end|>\n<|im_start|>assistant\n") + + def generate_with_backend(backend): + guided_params = GuidedDecodingParams(json=schema, backend=backend) + sampling_params = SamplingParams(temperature=0, + max_tokens=256, + guided_decoding=guided_params) + + outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) + assert outputs is not None + generated_text = outputs[0].outputs[0].text + assert generated_text is not None + parsed_json = json.loads(generated_text) + assert isinstance(parsed_json, dict) + jsonschema.validate(instance=parsed_json, schema=schema) + return parsed_json + + base_generated = generate_with_backend('guidance:disable-any-whitespace') + assert "a1" in base_generated + assert "a2" in base_generated + assert "a3" in base_generated + # by default additional keys are generated + assert "a4" in base_generated + assert "a5" in base_generated + assert "a6" in base_generated + + generated = generate_with_backend( + 'guidance:no-additional-properties,disable-any-whitespace') + assert "a1" in generated + assert "a2" in generated + assert "a3" in generated + assert "a4" not in generated + assert "a5" not in generated + assert "a6" not in generated diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index c243d81e7f18..fc8e271f7f91 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -412,3 +412,59 @@ def test_structured_output_auto_mode( # Parse to verify it is valid JSON parsed_json = json.loads(generated_text) assert isinstance(parsed_json, dict) + + +@pytest.mark.skip_global_cleanup +def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("VLLM_USE_V1", "1") + + backend = 'guidance:no-additional-properties,disable-any-whitespace' + llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", + max_model_len=1024, + guided_decoding_backend=backend) + + schema = { + 'type': 'object', + 'properties': { + 'a1': { + 'type': 'string' + }, + 'a2': { + 'type': 'string' + }, + 'a3': { + 'type': 'string' + } + }, + 'required': ['a1', 'a2', 'a3'], + } + + prompt = ( + "<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a " + "helpful assistant.<|im_end|>\n<|im_start|>user\nPlease generate a " + "large JSON object with key-value pairs a1=b1, a2=b2, ..., a20=b20" + "<|im_end|>\n<|im_start|>assistant\n") + + def generate_with_backend(backend): + guided_params = GuidedDecodingParams(json=schema, backend=backend) + sampling_params = SamplingParams(temperature=0, + max_tokens=256, + guided_decoding=guided_params) + + outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) + assert outputs is not None + generated_text = outputs[0].outputs[0].text + assert generated_text is not None + parsed_json = json.loads(generated_text) + assert isinstance(parsed_json, dict) + jsonschema.validate(instance=parsed_json, schema=schema) + return parsed_json + + generated = generate_with_backend( + 'guidance:no-additional-properties,disable-any-whitespace') + assert "a1" in generated + assert "a2" in generated + assert "a3" in generated + assert "a4" not in generated + assert "a5" not in generated + assert "a6" not in generated diff --git a/vllm/config.py b/vllm/config.py index 709983d05a50..741ce04d5dff 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3107,7 +3107,7 @@ def get_served_model_name(model: str, GuidedDecodingBackendV0 = Literal["auto", "outlines", "lm-format-enforcer", - "xgrammar"] + "xgrammar", "guidance"] GuidedDecodingBackendV1 = Literal["auto", "xgrammar", "guidance"] diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b6d0bfeac4a4..c67759c1d09a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,7 +18,8 @@ from vllm import version from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, ConfigFormat, ConfigType, DecodingConfig, Device, - DeviceConfig, DistributedExecutorBackend, HfOverrides, + DeviceConfig, DistributedExecutorBackend, + GuidedDecodingBackendV1, HfOverrides, KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig, ModelConfig, ModelImpl, MultiModalConfig, ObservabilityConfig, ParallelConfig, PoolerConfig, @@ -1370,14 +1371,13 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: recommend_to_remove=True) return False - # Xgrammar and Guidance are supported. - SUPPORTED_GUIDED_DECODING = [ - "xgrammar", "xgrammar:disable-any-whitespace", "guidance", - "guidance:disable-any-whitespace", "auto" - ] - if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING: - _raise_or_fallback(feature_name="--guided-decoding-backend", - recommend_to_remove=False) + # remove backend options when doing this check + if self.guided_decoding_backend.split(':')[0] \ + not in get_args(GuidedDecodingBackendV1): + _raise_or_fallback( + feature_name= + f"--guided-decoding-backend={self.guided_decoding_backend}", + recommend_to_remove=False) return False # Need at least Ampere for now (FA support required). 
diff --git a/vllm/model_executor/guided_decoding/guidance_decoding.py b/vllm/model_executor/guided_decoding/guidance_decoding.py index f19ebcbe420e..95b7c71107aa 100644 --- a/vllm/model_executor/guided_decoding/guidance_decoding.py +++ b/vllm/model_executor/guided_decoding/guidance_decoding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +import json from re import escape as regex_escape import llguidance @@ -7,6 +8,8 @@ from vllm.model_executor.guided_decoding.guidance_logits_processors import ( GuidanceLogitsProcessor) from vllm.sampling_params import GuidedDecodingParams +from vllm.v1.structured_output.backend_guidance import ( + process_for_additional_properties) def get_local_guidance_guided_decoding_logits_processor( @@ -20,9 +23,17 @@ def get_local_guidance_guided_decoding_logits_processor( grm = "" any_whitespace = 'disable-any-whitespace' not in \ guided_params.backend_options() - if guided_params.json: + if (guide_json := guided_params.json) is not None: + # Optionally set additionalProperties to False at the top-level + # By default, other backends do not allow additional top-level + # properties, so this makes guidance more similar to other backends + if 'no-additional-properties' in guided_params.backend_options(): + if not isinstance(guide_json, str): + guide_json = json.dumps(guide_json) + guide_json = process_for_additional_properties(guide_json) + grm = llguidance.LLMatcher.grammar_from_json_schema( - guided_params.json, + guide_json, overrides={"whitespace_pattern": guided_params.whitespace_pattern}, defaults={ "whitespace_flexible": any_whitespace, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 26c57b31aacd..7e6b7ba47035 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -145,15 +145,7 @@ def _validate_structured_output(self, params: SamplingParams) -> None: if not params.guided_decoding or not self.decoding_config: return - supported_backends = [ - "xgrammar", "xgrammar:disable-any-whitespace", "guidance", - "guidance:disable-any-whitespace", "auto" - ] - engine_level_backend = self.decoding_config.guided_decoding_backend - if engine_level_backend not in supported_backends: - raise ValueError(f"Only {supported_backends} structured output is " - "supported in V1.") if params.guided_decoding.backend: # Request-level backend selection is not supported in V1. 
# The values may differ if `params` is reused and was set diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 9150a28570bd..0edb15558dce 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -1,14 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 +import copy +import json import os from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Any, Optional, Union import torch from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.utils import LazyLoader from vllm.v1.structured_output.backend_types import (StructuredOutputBackend, @@ -29,6 +31,29 @@ logger = init_logger(__name__) +def _walk_json_for_additional_properties(data: object): + if isinstance(data, dict): + for value in data.values(): + _walk_json_for_additional_properties(value) + if 'additionalProperties' not in data and \ + ('properties' in data or 'patternProperties' in data): + data['additionalProperties'] = False + elif isinstance(data, list): + for item in data: + _walk_json_for_additional_properties(item) + + +def process_for_additional_properties( + guide_json: Union[str, dict[str, Any]]) -> dict[str, Any]: + if isinstance(guide_json, str): + guide_json_obj = json.loads(guide_json) + else: + # copy for modifications + guide_json_obj = copy.deepcopy(guide_json) + _walk_json_for_additional_properties(guide_json_obj) + return guide_json_obj + + class GuidanceBackend(StructuredOutputBackend): def __init__(self, vllm_config: VllmConfig): @@ -41,9 +66,20 @@ def __init__(self, vllm_config: VllmConfig): tokenizer_group.ping() self.vllm_config = vllm_config self.vocab_size = vllm_config.model_config.get_vocab_size() - self.disable_any_whitespace = ( - "disable-any-whitespace" - in vllm_config.decoding_config.guided_decoding_backend) + + self.disable_any_whitespace = False + self.no_additional_properties = False + backend_options = GuidedDecodingParams( + backend=vllm_config.decoding_config.guided_decoding_backend + ).backend_options() + for option in backend_options: + if option == "disable-any-whitespace": + self.disable_any_whitespace = True + elif option == "no-additional-properties": + self.no_additional_properties = True + else: + raise ValueError( + f"Unsupported option for the guidance backend: {option}") tokenizer = tokenizer_group.get_lora_tokenizer(None) self.ll_tokenizer = llguidance_hf.from_tokenizer( @@ -52,7 +88,8 @@ def __init__(self, vllm_config: VllmConfig): def compile_grammar(self, request_type: StructuredOutputOptions, grammar_spec: str) -> StructuredOutputGrammar: self.serialized_grammar = serialize_guidance_grammar( - request_type, grammar_spec, self.disable_any_whitespace) + request_type, grammar_spec, self.disable_any_whitespace, + self.no_additional_properties) ll_matcher = llguidance.LLMatcher( self.ll_tokenizer, @@ -129,10 +166,15 @@ def reset(self): self.ll_matcher.reset() -def serialize_guidance_grammar(request_type: StructuredOutputOptions, - grammar_spec: str, - disable_any_whitespace: bool = False) -> str: +def serialize_guidance_grammar( + request_type: StructuredOutputOptions, + grammar_spec: Union[str, dict[str, Any]], + disable_any_whitespace: bool = False, + no_additional_properties: bool = False, +) -> str: 
if request_type == StructuredOutputOptions.JSON: + if no_additional_properties: + grammar_spec = process_for_additional_properties(grammar_spec) return llguidance.LLMatcher.grammar_from_json_schema( grammar_spec, defaults={ diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index c9839bd7ddee..1e4470153e30 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -9,7 +9,7 @@ import vllm.envs from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import GuidedDecodingParams, SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.utils import LazyLoader @@ -32,9 +32,6 @@ class XgrammarBackend(StructuredOutputBackend): def __init__(self, vllm_config: VllmConfig): self.vllm_config = vllm_config - self.disable_any_whitespace = ( - "disable-any-whitespace" - in vllm_config.decoding_config.guided_decoding_backend) tokenizer_group = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, @@ -42,6 +39,17 @@ def __init__(self, vllm_config: VllmConfig): lora_config=vllm_config.lora_config) # type: ignore[arg-type] tokenizer_group.ping() + self.disable_any_whitespace = False + backend_options = GuidedDecodingParams( + backend=vllm_config.decoding_config.guided_decoding_backend + ).backend_options() + for option in backend_options: + if option == "disable-any-whitespace": + self.disable_any_whitespace = True + else: + raise ValueError( + f"Unsupported option for the xgrammar backend: {option}") + tokenizer = tokenizer_group.get_lora_tokenizer(None) self.vocab_size = vllm_config.model_config.get_vocab_size() if isinstance(tokenizer, MistralTokenizer): From 32d4b669d00d50ad084af128b9c01a5583fb2b7d Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Wed, 23 Apr 2025 12:12:35 -0700 Subject: [PATCH 27/81] [BugFix][V1] Fix int32 token index overflow when preparing input ids (#16806) --- vllm/v1/worker/gpu_model_runner.py | 3 ++- vllm/v1/worker/tpu_model_runner.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bdf0d0f72289..4ecf72b56ef6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -241,10 +241,11 @@ def __init__( device=self.device) # OPTIMIZATION: Cache the tensors rather than creating them every step. + # Keep in int64 to avoid overflow with long context self.arange_np = np.arange(max(self.max_num_reqs + 1, self.max_model_len, self.max_num_tokens), - dtype=np.int32) + dtype=np.int64) # NOTE(woosuk): These tensors are "stateless", i.e., they are literally # a faster version of creating a new tensor every time. Thus, we should # not make any assumptions about the values in these tensors. diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 5d94f675f92e..e9cb0dbe8b5e 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -219,7 +219,8 @@ def __init__( # Range tensor with values [0 .. self.max_num_tokens - 1]. 
# Used to initialize positions / context_lens / seq_lens - self.arange_np = np.arange(self.max_num_tokens, dtype=np.int32) + # Keep in int64 to avoid overflow with long context + self.arange_np = np.arange(self.max_num_tokens, dtype=np.int64) self.num_reqs_paddings = _get_req_paddings( min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs) From 41fb013d299941486cb026a0e607d2f77a7ec1e8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 23 Apr 2025 14:57:43 -0700 Subject: [PATCH 28/81] [V1][Spec Decode] Always use argmax for sampling draft tokens (#16899) Signed-off-by: Woosuk Kwon --- vllm/v1/sample/rejection_sampler.py | 14 +++++++------- vllm/v1/spec_decode/eagle.py | 22 ++++++++++------------ vllm/v1/worker/gpu_model_runner.py | 5 +---- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 3cf7fde5cd0e..9061a64db57c 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -226,7 +226,7 @@ def rejection_sample( is_greedy, max_spec_len, vocab_size, - IS_NGRAM=draft_probs is None, + NO_DRAFT_PROBS=draft_probs is None, num_warps=1, ) return output_token_ids @@ -423,7 +423,7 @@ def sample_recovered_tokens( q, vocab_size, triton.next_power_of_2(vocab_size), - IS_NGRAM=draft_probs is None, + NO_DRAFT_PROBS=draft_probs is None, ) return recovered_token_ids @@ -490,7 +490,7 @@ def rejection_random_sample_kernel( is_greedy_ptr, # [batch_size] max_spec_len, vocab_size, - IS_NGRAM: tl.constexpr, + NO_DRAFT_PROBS: tl.constexpr, ): req_idx = tl.program_id(0) is_greedy = tl.load(is_greedy_ptr + req_idx) @@ -509,7 +509,7 @@ def rejection_random_sample_kernel( for pos in range(num_draft_tokens): if not rejected: draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos) - if IS_NGRAM: + if NO_DRAFT_PROBS: draft_prob = 1 else: draft_prob = tl.load(draft_probs_ptr + @@ -575,7 +575,7 @@ def sample_recovered_tokens_kernel( q_ptr, # [batch_size, vocab_size] vocab_size, PADDED_VOCAB_SIZE: tl.constexpr, - IS_NGRAM: tl.constexpr, + NO_DRAFT_PROBS: tl.constexpr, ): req_idx = tl.program_id(0) if req_idx == 0: @@ -591,7 +591,7 @@ def sample_recovered_tokens_kernel( return vocab_offset = tl.arange(0, PADDED_VOCAB_SIZE) - if IS_NGRAM: + if NO_DRAFT_PROBS: draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos) orig_prob = tl.load(target_probs_ptr + (start_idx + pos) * vocab_size + draft_token_id) @@ -624,7 +624,7 @@ def sample_recovered_tokens_kernel( recovered_id = tl.argmax(prob / q, axis=-1) tl.store(output_token_ids_ptr + start_idx + pos, recovered_id) - if IS_NGRAM: + if NO_DRAFT_PROBS: # Restore the original probability. 
tl.store( target_probs_ptr + (start_idx + pos) * vocab_size + draft_token_id, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 3efafa8f0b1f..95f0c067d406 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -51,7 +51,7 @@ def propose( # [batch_size, max_num_blocks_per_req] block_table: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> tuple[torch.Tensor, torch.Tensor]: + ) -> torch.Tensor: num_tokens = target_token_ids.shape[0] batch_size = next_token_ids.shape[0] last_token_indices = cu_num_tokens[1:] - 1 @@ -94,17 +94,15 @@ def propose( ) sample_hidden_states = hidden_states[last_token_indices] logits = self.model.compute_logits(sample_hidden_states, None) - draft_token_ids, draft_probs = compute_probs_and_sample_next_token( - logits, sampling_metadata) + draft_token_ids = logits.argmax(dim=-1) # Early exit if there is only one draft token to be generated. if self.num_speculative_tokens == 1: - # [batch_size, 1] and [batch_size, 1, vocab_size] - return draft_token_ids.view(-1, 1), draft_probs.unsqueeze(dim=1) + # [batch_size, 1] + return draft_token_ids.view(-1, 1) # Generate the remaining draft tokens. draft_token_ids_list = [draft_token_ids] - draft_probs_list = [draft_probs] positions = target_positions[last_token_indices] hidden_states = sample_hidden_states @@ -159,16 +157,12 @@ def propose( positions=clamped_positions, ) logits = self.model.compute_logits(hidden_states, None) - draft_token_ids, probs = compute_probs_and_sample_next_token( - logits, sampling_metadata) + draft_token_ids = logits.argmax(dim=-1) draft_token_ids_list.append(draft_token_ids) - draft_probs_list.append(probs) # [batch_size, num_speculative_tokens] draft_token_ids = torch.stack(draft_token_ids_list, dim=1) - # [batch_size, num_speculative_tokens, vocab_size] - draft_probs = torch.stack(draft_probs_list, dim=1) - return draft_token_ids, draft_probs + return draft_token_ids @staticmethod def prepare_inputs( @@ -238,6 +232,10 @@ def load_model(self, target_model: nn.Module) -> None: self.model.lm_head = target_model.lm_head +# NOTE(woosuk): Currently, the below code is not used and we always use argmax +# to sample the draft tokens. We will use this after we find a way to manage +# the draft prob tensor. +# Refer to https://github.com/vllm-project/vllm/pull/16899 for the details. # FIXME(woosuk): The logic here is duplicated with the main sampling code. # We should refactor this to reuse the same sampling implementation. def compute_probs_and_sample_next_token( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4ecf72b56ef6..c12e4fd555d2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1230,7 +1230,7 @@ def execute_model( target_hidden_states = hidden_states[token_indices] target_slot_mapping = attn_metadata.slot_mapping[token_indices] - draft_token_ids, draft_probs = self.drafter.propose( + draft_token_ids = self.drafter.propose( target_token_ids=target_token_ids, target_positions=target_positions, target_hidden_states=target_hidden_states, @@ -1241,9 +1241,6 @@ def execute_model( sampling_metadata=sampling_metadata, ) spec_token_ids = draft_token_ids.tolist() - # TODO(woosuk): Cache draft_probs and use it for rejection sampling - # in the next step. - del draft_probs # Clear KVConnector state after all KVs are generated. 
if has_kv_transfer_group(): From b07d741661570ef199ba92528b11c0bd1294fb15 Mon Sep 17 00:00:00 2001 From: Sangyeon Cho Date: Thu, 24 Apr 2025 08:14:18 +0900 Subject: [PATCH 29/81] [CI/Build] workaround for CI build failure (#17070) Signed-off-by: csy1204 Co-authored-by: Michael Goin --- docker/Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index e8e18df1bb49..80440901c07a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -162,6 +162,9 @@ ENV UV_HTTP_TIMEOUT=500 COPY requirements/lint.txt requirements/lint.txt COPY requirements/test.txt requirements/test.txt COPY requirements/dev.txt requirements/dev.txt +# Workaround for #17068 +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system mamba-ssm==2.2.4 --no-build-isolation RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/dev.txt #################### DEV IMAGE #################### @@ -265,6 +268,9 @@ ADD . /vllm-workspace/ ENV UV_HTTP_TIMEOUT=500 # install development dependencies (for testing) +# Workaround for #17068 +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system mamba-ssm==2.2.4 --no-build-isolation RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/dev.txt From 6b2427f995a81377e5758b34c8bb4c66db2f67bc Mon Sep 17 00:00:00 2001 From: Chen Xia Date: Wed, 23 Apr 2025 17:32:40 -0700 Subject: [PATCH 30/81] [Quantization]add prefix for commandA quantized model (#17017) --- vllm/model_executor/models/commandr.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index bb8d9bf8a03c..8912affe36fe 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -89,6 +89,7 @@ def __init__( self, config: CohereConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -99,12 +100,14 @@ def __init__( [self.intermediate_size] * 2, bias=False, quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", ) self.down_proj = RowParallelLinear( self.intermediate_size, self.hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.down_proj", ) self.act_fn = SiluAndMul() @@ -158,12 +161,14 @@ def __init__( self.total_num_kv_heads, bias=False, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, self.hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( self.head_dim, @@ -244,7 +249,9 @@ def __init__(self, quant_config=quant_config, prefix=f"{prefix}.self_attn") - self.mlp = CohereMLP(config, quant_config=quant_config) + self.mlp = CohereMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.input_layernorm = LayerNorm(param_shape=(config.hidden_size), eps=config.layer_norm_eps) From 46e678bcff350518bee8b92c826f21c5cdd0d08f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 23 Apr 2025 19:18:59 -0700 Subject: [PATCH 31/81] [Minor] Use larger batch sizes for A100/B100/B200/MI300x (#17073) Signed-off-by: Woosuk Kwon --- vllm/engine/arg_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c67759c1d09a..00328f56b713 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -35,7 +35,7 @@ from vllm.test_utils 
import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, is_in_ray_actor +from vllm.utils import FlexibleArgumentParser, GiB_bytes, is_in_ray_actor # yapf: enable @@ -1625,13 +1625,13 @@ def _set_default_args_v1(self, usage_context: UsageContext) -> None: # values for non-H100/H200 GPUs. try: from vllm.platforms import current_platform - device_name = current_platform.get_device_name().lower() + device_memory = current_platform.get_device_total_memory() except Exception: # This is only used to set default_max_num_batched_tokens - device_name = "no-device" + device_memory = 0 - if "h100" in device_name or "h200" in device_name: - # For H100 and H200, we use larger default values. + if device_memory >= 70 * GiB_bytes: + # For GPUs like H100 and MI300x, use larger default values. default_max_num_batched_tokens = { UsageContext.LLM_CLASS: 16384, UsageContext.OPENAI_API_SERVER: 8192, From ed50f46641d5960f9987a3aae1d8fbc8fd046920 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 23 Apr 2025 20:54:00 -0600 Subject: [PATCH 32/81] [Bugfix] Enable V1 usage stats (#16986) Signed-off-by: mgoin Signed-off-by: Nick Hill Co-authored-by: Nick Hill --- vllm/usage/usage_lib.py | 8 +++---- vllm/utils.py | 20 +++++++++++++++- vllm/v1/engine/async_llm.py | 4 ++++ vllm/v1/engine/llm_engine.py | 4 ++++ vllm/v1/utils.py | 44 ++++++++++++++++++++++++++++++++++++ 5 files changed, 75 insertions(+), 5 deletions(-) diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 2ee3f9104d19..7132681050e1 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -19,6 +19,7 @@ import vllm.envs as envs from vllm.connections import global_http_connection +from vllm.utils import cuda_device_count_stateless, cuda_get_device_properties from vllm.version import __version__ as VLLM_VERSION _config_home = envs.VLLM_CONFIG_ROOT @@ -168,10 +169,9 @@ def _report_usage_once(self, model_architecture: str, # Platform information from vllm.platforms import current_platform if current_platform.is_cuda_alike(): - device_property = torch.cuda.get_device_properties(0) - self.gpu_count = torch.cuda.device_count() - self.gpu_type = device_property.name - self.gpu_memory_per_device = device_property.total_memory + self.gpu_count = cuda_device_count_stateless() + self.gpu_type, self.gpu_memory_per_device = ( + cuda_get_device_properties(0, ("name", "total_memory"))) if current_platform.is_cuda(): self.cuda_runtime = torch.version.cuda self.provider = _detect_cloud_provider() diff --git a/vllm/utils.py b/vllm/utils.py index c65a370bd530..ed406a6b7b11 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -38,11 +38,13 @@ from collections import UserDict, defaultdict from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable, Iterable, Iterator, KeysView, Mapping) +from concurrent.futures.process import ProcessPoolExecutor from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps from types import MappingProxyType from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, - Optional, Tuple, Type, TypeVar, Union, cast, overload) + Optional, Sequence, Tuple, Type, TypeVar, Union, cast, + overload) from uuid import uuid4 import cachetools @@ -1235,6 +1237,22 @@ def cuda_is_initialized() -> bool: return torch.cuda.is_initialized() +def cuda_get_device_properties(device, + names: Sequence[str], + init_cuda=False) -> 
tuple[Any, ...]: + """Get specified CUDA device property values without initializing CUDA in + the current process.""" + if init_cuda or cuda_is_initialized(): + props = torch.cuda.get_device_properties(device) + return tuple(getattr(props, name) for name in names) + + # Run in subprocess to avoid initializing CUDA as a side effect. + mp_ctx = multiprocessing.get_context("fork") + with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor: + return executor.submit(cuda_get_device_properties, device, names, + True).result() + + def weak_bind(bound_method: Callable[..., Any], ) -> Callable[..., None]: """Make an instance method that weakly references its associated instance and no-ops once that diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index bc49a0d3bb5b..1149dfa9ce5c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -36,6 +36,7 @@ from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger, StatLoggerBase) from vllm.v1.metrics.stats import IterationStats, SchedulerStats +from vllm.v1.utils import report_usage_stats logger = init_logger(__name__) @@ -114,6 +115,9 @@ def __init__( except RuntimeError: pass + # If usage stat is enabled, collect relevant info. + report_usage_stats(vllm_config, usage_context) + @classmethod def from_vllm_config( cls, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c05319f3d80c..6fa90b269825 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -28,6 +28,7 @@ from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor +from vllm.v1.utils import report_usage_stats logger = init_logger(__name__) @@ -99,6 +100,9 @@ def __init__( # for v0 compatibility self.model_executor = self.engine_core.engine_core.model_executor # type: ignore + # If usage stat is enabled, collect relevant info. + report_usage_stats(vllm_config, usage_context) + @classmethod def from_vllm_config( cls, diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 9c0fa2d0773d..dc6457bf9032 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -12,6 +12,8 @@ from vllm.logger import init_logger from vllm.model_executor.models.utils import extract_layer_index +from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, + usage_message) from vllm.utils import get_mp_context, kill_process_tree if TYPE_CHECKING: @@ -201,3 +203,45 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor, Returns the sliced target tensor. 
""" return to_tensor[:length].copy_(from_tensor[:length], non_blocking=True) + + +def report_usage_stats(vllm_config, usage_context: UsageContext) -> None: + """Report usage statistics if enabled.""" + + if not is_usage_stats_enabled(): + return + + from vllm.model_executor.model_loader import get_architecture_class_name + + usage_message.report_usage( + get_architecture_class_name(vllm_config.model_config), + usage_context, + extra_kvs={ + # Common configuration + "dtype": + str(vllm_config.model_config.dtype), + "tensor_parallel_size": + vllm_config.parallel_config.tensor_parallel_size, + "block_size": + vllm_config.cache_config.block_size, + "gpu_memory_utilization": + vllm_config.cache_config.gpu_memory_utilization, + + # Quantization + "quantization": + vllm_config.model_config.quantization, + "kv_cache_dtype": + str(vllm_config.cache_config.cache_dtype), + + # Feature flags + "enable_lora": + bool(vllm_config.lora_config), + "enable_prompt_adapter": + bool(vllm_config.prompt_adapter_config), + "enable_prefix_caching": + vllm_config.cache_config.enable_prefix_caching, + "enforce_eager": + vllm_config.model_config.enforce_eager, + "disable_custom_all_reduce": + vllm_config.parallel_config.disable_custom_all_reduce, + }) From 2c8ed8ee485c13d88d7aa4fc4b8e1ac4b4301a78 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 24 Apr 2025 03:54:03 +0100 Subject: [PATCH 33/81] More informative error when using Transformers backend (#16988) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/source/models/supported_models.md | 48 ++++++++++++----------- vllm/model_executor/model_loader/utils.py | 24 ++++++------ 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 0fdffbeefd03..0fad1b1ef276 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -40,33 +40,37 @@ You can force the use of `TransformersForCausalLM` by setting `model_impl="trans vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. ::: -#### Supported features +#### Custom models -The Transformers modeling backend explicitly supports the following features: +If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM! -- (except GGUF) -- -- +For a model to be compatible with the Transformers backend for vLLM it must: -#### Remote Code +- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)): + * The model directory must have the correct structure (e.g. `config.json` is present). + * `config.json` must contain `auto_map.AutoModel`. +- be a Transformers backend for vLLM compatible model (see ): + * Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`). -If your model is neither supported natively by vLLM or Transformers, you can still run it in vLLM! +If the compatible model is: -Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers. -Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM! 
+- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for or `--trust-remode-code` for the . +- in a local directory, simply pass directory path to `model=` for or `vllm serve ` for the . -:::{tip} -If you have not yet created your custom model, you can follow this guide on [customising models in Transformers](https://huggingface.co/docs/transformers/en/custom_models). -::: +This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM! -```python -from vllm import LLM -llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of your model -llm.apply_model(lambda model: print(model.__class__)) -``` +(writing-custom-models)= + +#### Writing custom models + +This section details the necessary modifications to make to a Transformers compatible custom model that make it compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)). To make your model compatible with the Transformers backend, it needs: +1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`. +2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. +3. `MyModel` must contain `_supports_attention_backend = True`. + ```{code-block} python :caption: modeling_my_model.py @@ -75,7 +79,7 @@ from torch import nn class MyAttention(nn.Module): - def forward(self, hidden_states, **kwargs): # <- kwargs are required + def forward(self, hidden_states, **kwargs): ... attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -91,11 +95,11 @@ class MyModel(PreTrainedModel): _supports_attention_backend = True ``` -Here is what happens in the background: +Here is what happens in the background when this model is loaded: -1. The config is loaded -2. `MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`. -3. The `TransformersForCausalLM` backend is used. See , which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`. +1. The config is loaded. +2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. +3. `MyModel` is loaded into `TransformersForCausalLM` (see ) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. That's it! 
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index af4f2e95a9af..0ca6b6fd88b6 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -30,15 +30,6 @@ def set_default_torch_dtype(dtype: torch.dtype): torch.set_default_dtype(old_dtype) -def is_transformers_impl_compatible( - arch: str, - module: Optional["transformers.PreTrainedModel"] = None) -> bool: - mod = module or getattr(transformers, arch, None) - if mod is None: - return False - return mod.is_backend_compatible() - - def resolve_transformers_arch(model_config: ModelConfig, architectures: list[str]): for i, arch in enumerate(architectures): @@ -61,17 +52,26 @@ def resolve_transformers_arch(model_config: ModelConfig, revision=model_config.revision) for name, module in sorted(auto_map.items(), key=lambda x: x[0]) } - custom_model_module = auto_modules.get("AutoModel") + model_module = getattr(transformers, arch, None) + if model_module is None: + if "AutoModel" not in auto_map: + raise ValueError( + f"Cannot find model module. '{arch}' is not a registered " + "model in the Transformers library (only relevant if the " + "model is meant to be in Transformers) and 'AutoModel' is " + "not present in the model config's 'auto_map' (relevant " + "if the model is custom).") + model_module = auto_modules["AutoModel"] # TODO(Isotr0py): Further clean up these raises. # perhaps handled them in _ModelRegistry._raise_for_unsupported? if model_config.model_impl == ModelImpl.TRANSFORMERS: - if not is_transformers_impl_compatible(arch, custom_model_module): + if not model_module.is_backend_compatible(): raise ValueError( f"The Transformers implementation of {arch} is not " "compatible with vLLM.") architectures[i] = "TransformersForCausalLM" if model_config.model_impl == ModelImpl.AUTO: - if not is_transformers_impl_compatible(arch, custom_model_module): + if not model_module.is_backend_compatible(): raise ValueError( f"{arch} has no vLLM implementation and the Transformers " "implementation is not compatible with vLLM. Try setting " From ed2e464653ab552c54c6da5c2b69d31bd61ba765 Mon Sep 17 00:00:00 2001 From: Areeb Syed Date: Thu, 24 Apr 2025 08:25:00 +0530 Subject: [PATCH 34/81] Addendum Fix to support FIPS enabled machines with MD5 hashing (#17043) Signed-off-by: sydarb --- .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 3 ++- vllm/envs.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 1d2040784e6c..f91ffbc720e7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -357,7 +357,8 @@ def _generate_foldername_debug( ids. 
""" input_ids_bytes = input_ids.numpy().tobytes() - input_ids_hash = hashlib.md5(input_ids_bytes).hexdigest() + input_ids_hash = hashlib.md5(input_ids_bytes, + usedforsecurity=False).hexdigest() foldername = os.path.join(self._storage_path, input_ids_hash) if create_folder: os.makedirs(foldername, exist_ok=True) diff --git a/vllm/envs.py b/vllm/envs.py index 03a8a2b20f02..ea40bfff11b5 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -794,6 +794,7 @@ def factorize(name: str): if key in environment_variables: factorize(key) - hash_str = hashlib.md5(str(factors).encode()).hexdigest() + hash_str = hashlib.md5(str(factors).encode(), + usedforsecurity=False).hexdigest() return hash_str From 6167c0e5d2a18fb4c4d14be3b4e9f5e35b712b76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=AE=87?= Date: Thu, 24 Apr 2025 11:25:37 +0800 Subject: [PATCH 35/81] =?UTF-8?q?[Bugfix][Core]=20add=20seq=5Fid=5Fto=5Fse?= =?UTF-8?q?q=5Fgroup=20clearing=20to=20avoid=20memory=20leak=20when=20s?= =?UTF-8?q?=E2=80=A6=20(#16472)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: ๅผ€ๅ“ฒ Co-authored-by: ๅผ€ๅ“ฒ --- vllm/outputs.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/outputs.py b/vllm/outputs.py index c8b9be5424e4..65a6ed01451d 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -179,6 +179,13 @@ def from_seq_group( group.finish_seq(seq_group) if assembled_seq_group is None: return None + + # clear finished seq in seq_id_to_seq_group + if len(group.to_be_finished) == 0: + for sub_request_id in list(group.seq_id_to_index.keys()): + if sub_request_id in seq_id_to_seq_group: + del seq_id_to_seq_group[sub_request_id] + return cls.from_seq_group(assembled_seq_group, use_cache, seq_id_to_seq_group) From db2f8d915ccfb4445bbac4e01ad2222260903868 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Thu, 24 Apr 2025 14:57:17 +0800 Subject: [PATCH 36/81] [V1] Update structured output (#16812) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- .../benchmark_serving_structured_output.py | 12 ++++++------ docs/source/features/structured_outputs.md | 14 +++++++------- .../openai_chat_completion_structured_outputs.py | 16 ++++++++-------- ...mpletion_structured_outputs_with_reasoning.py | 12 ++++++------ 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 5dd9b1dbd461..74ee00ec8930 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -150,17 +150,17 @@ def get_schema(index: int): elif args.dataset == "grammar": schema = """ - ?start: select_statement + root ::= select_statement - ?select_statement: "SELECT " column_list " FROM " table_name + select_statement ::= "SELECT " column " from " table " where " condition - ?column_list: column_name ("," column_name)* + column ::= "col_1 " | "col_2 " - ?table_name: identifier + table ::= "table_1 " | "table_2 " - ?column_name: identifier + condition ::= column "= " number - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + number ::= "1 " | "2 " """ prompt = "Generate an SQL query to show the 'username' \ and 'email' from the 'users' table." 
diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index de3c5bf5e7ab..4786a7d80cf2 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -50,7 +50,7 @@ completion = client.chat.completions.create( "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", } ], - extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]}, + extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, ) print(completion.choices[0].message.content) ``` @@ -105,17 +105,17 @@ It works by using a context free EBNF grammar, which for example we can use to d ```python simplified_sql_grammar = """ - ?start: select_statement + root ::= select_statement - ?select_statement: "SELECT " column_list " FROM " table_name + select_statement ::= "SELECT " column " from " table " where " condition - ?column_list: column_name ("," column_name)* + column ::= "col_1 " | "col_2 " - ?table_name: identifier + table ::= "table_1 " | "table_2 " - ?column_name: identifier + condition ::= column "= " number - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + number ::= "1 " | "2 " """ completion = client.chat.completions.create( diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index 986ff500e586..1b690d19b4e8 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -33,7 +33,7 @@ "content": prompt, }], extra_body={ - "guided_regex": "\w+@\w+\.com\n", + "guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"] }, ) @@ -70,17 +70,17 @@ class CarDescription(BaseModel): # Guided decoding by Grammar simplified_sql_grammar = """ - ?start: select_statement + root ::= select_statement - ?select_statement: "SELECT " column_list " FROM " table_name + select_statement ::= "SELECT " column " from " table " where " condition - ?column_list: column_name ("," column_name)* + column ::= "col_1 " | "col_2 " - ?table_name: identifier + table ::= "table_1 " | "table_2 " - ?column_name: identifier + condition ::= column "= " number - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + number ::= "1 " | "2 " """ prompt = ("Generate an SQL query to show the 'username' and 'email'" @@ -110,7 +110,7 @@ class CarDescription(BaseModel): "content": prompt, }], extra_body={ - "guided_regex": "\w+@\w+\.com\n", + "guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"], "guided_decoding_backend": "xgrammar:no-fallback" }, diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py index 9ceeae8fa96a..be634401679c 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py @@ -101,17 +101,17 @@ class CarDescription(BaseModel): # Guided decoding by Grammar simplified_sql_grammar = """ - ?start: select_statement + root ::= select_statement - ?select_statement: "SELECT " column_list " FROM " table_name + select_statement ::= "SELECT " column " from " table " where " condition - ?column_list: column_name ("," column_name)* + column ::= "col_1 " | "col_2 " - ?table_name: identifier + table ::= "table_1 " | "table_2 " - ?column_name: identifier + condition 
::= column "= " number - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + number ::= "1 " | "2 " """ # This may be very slow https://github.com/vllm-project/vllm/issues/12122 From 9c1244de5732d7ea8db5f71d5c54c6dcf05089c4 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Thu, 24 Apr 2025 15:58:08 +0800 Subject: [PATCH 37/81] [doc] update to hyperlink (#17096) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/source/features/quantization/auto_awq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index b703d0195319..d8773dbd7b20 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -12,7 +12,7 @@ You can quantize your own models by installing AutoAWQ or picking one of the [65 pip install autoawq ``` -After installing AutoAWQ, you are ready to quantize a model. Please refer to the `AutoAWQ documentation `_ for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: +After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: ```python from awq import AutoAWQForCausalLM From 2bc0f72ae5a404224bc7be2a81c65d20b387d53a Mon Sep 17 00:00:00 2001 From: omer-dayan Date: Thu, 24 Apr 2025 11:03:21 +0300 Subject: [PATCH 38/81] Add docs for runai_streamer_sharded (#17093) Signed-off-by: Omer Dayan (SW-GPU) Co-authored-by: Cyrus Leung --- .../models/extensions/runai_model_streamer.md | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md index 99c37876a01b..e0daa6f86dde 100644 --- a/docs/source/models/extensions/runai_model_streamer.md +++ b/docs/source/models/extensions/runai_model_streamer.md @@ -51,3 +51,29 @@ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer - :::{note} For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). ::: + +## Sharded Model Loading + +vLLM also supports loading sharded models using Run:ai Model Streamer. This is particularly useful for large models that are split across multiple files. To use this feature, use the `--load-format runai_streamer_sharded` flag: + +```console +vllm serve /path/to/sharded/model --load-format runai_streamer_sharded +``` + +The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`: + +```console +vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}' +``` + +To create sharded model files, you can use the script provided in . This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader. 
+ +The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way: + +```console +vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}' +``` + +:::{note} +The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint. +::: From b411418ff090a168c85eab243b14b7350bf73db4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 24 Apr 2025 02:49:33 -0700 Subject: [PATCH 39/81] [Chore] Remove Sampler from Model Code (#17084) Signed-off-by: Woosuk Kwon --- tests/spec_decode/test_scorer.py | 5 ++--- vllm/model_executor/models/arctic.py | 10 --------- vllm/model_executor/models/aria.py | 12 +---------- vllm/model_executor/models/aya_vision.py | 16 -------------- vllm/model_executor/models/baichuan.py | 10 --------- vllm/model_executor/models/bamba.py | 10 --------- vllm/model_executor/models/bart.py | 10 --------- vllm/model_executor/models/blip2.py | 18 +--------------- vllm/model_executor/models/bloom.py | 10 --------- vllm/model_executor/models/chameleon.py | 10 --------- vllm/model_executor/models/chatglm.py | 10 --------- vllm/model_executor/models/commandr.py | 10 --------- vllm/model_executor/models/dbrx.py | 10 --------- vllm/model_executor/models/deepseek.py | 10 --------- vllm/model_executor/models/deepseek_mtp.py | 11 ---------- vllm/model_executor/models/deepseek_v2.py | 10 --------- vllm/model_executor/models/deepseek_vl2.py | 16 -------------- vllm/model_executor/models/eagle.py | 13 ------------ vllm/model_executor/models/exaone.py | 11 ---------- vllm/model_executor/models/falcon.py | 10 --------- vllm/model_executor/models/florence2.py | 21 ------------------- vllm/model_executor/models/fuyu.py | 13 ------------ vllm/model_executor/models/gemma.py | 10 --------- vllm/model_executor/models/gemma2.py | 10 --------- vllm/model_executor/models/gemma3.py | 10 --------- vllm/model_executor/models/gemma3_mm.py | 16 ++------------ vllm/model_executor/models/glm4.py | 10 --------- vllm/model_executor/models/gpt2.py | 10 --------- vllm/model_executor/models/gpt_bigcode.py | 10 --------- vllm/model_executor/models/gpt_j.py | 10 --------- vllm/model_executor/models/gpt_neox.py | 10 --------- vllm/model_executor/models/granite.py | 11 ---------- vllm/model_executor/models/granitemoe.py | 11 ---------- .../model_executor/models/granitemoeshared.py | 11 ---------- vllm/model_executor/models/grok1.py | 10 --------- vllm/model_executor/models/idefics3.py | 10 --------- vllm/model_executor/models/interfaces_base.py | 9 -------- vllm/model_executor/models/internlm2.py | 12 +---------- vllm/model_executor/models/internvl.py | 18 +--------------- vllm/model_executor/models/jais.py | 10 --------- vllm/model_executor/models/jamba.py | 10 --------- vllm/model_executor/models/kimi_vl.py | 12 +---------- vllm/model_executor/models/llama.py | 8 ------- vllm/model_executor/models/llava.py | 16 -------------- vllm/model_executor/models/llava_next.py | 16 -------------- .../model_executor/models/llava_next_video.py | 16 -------------- vllm/model_executor/models/llava_onevision.py | 16 -------------- vllm/model_executor/models/mamba.py | 10 --------- vllm/model_executor/models/mamba2.py | 10 --------- vllm/model_executor/models/minicpm.py | 10 --------- vllm/model_executor/models/minicpmv.py | 18 
+--------------- vllm/model_executor/models/minimax_text_01.py | 12 ----------- vllm/model_executor/models/mistral3.py | 16 -------------- vllm/model_executor/models/mixtral.py | 10 --------- vllm/model_executor/models/mixtral_quant.py | 10 --------- vllm/model_executor/models/mllama.py | 10 --------- vllm/model_executor/models/mllama4.py | 13 ------------ vllm/model_executor/models/molmo.py | 12 +---------- vllm/model_executor/models/mpt.py | 10 --------- vllm/model_executor/models/nemotron.py | 11 ---------- vllm/model_executor/models/nemotron_nas.py | 8 ------- vllm/model_executor/models/olmo.py | 10 --------- vllm/model_executor/models/olmo2.py | 10 --------- vllm/model_executor/models/olmoe.py | 10 --------- vllm/model_executor/models/opt.py | 10 --------- vllm/model_executor/models/orion.py | 10 --------- vllm/model_executor/models/paligemma.py | 14 +------------ vllm/model_executor/models/persimmon.py | 10 --------- vllm/model_executor/models/phi.py | 10 --------- vllm/model_executor/models/phi3_small.py | 13 +----------- vllm/model_executor/models/phi3v.py | 16 -------------- vllm/model_executor/models/phi4mm.py | 10 --------- vllm/model_executor/models/phimoe.py | 10 --------- vllm/model_executor/models/pixtral.py | 15 ------------- vllm/model_executor/models/plamo2.py | 10 --------- vllm/model_executor/models/qwen.py | 10 --------- vllm/model_executor/models/qwen2.py | 10 --------- .../models/qwen2_5_omni_thinker.py | 17 +-------------- vllm/model_executor/models/qwen2_5_vl.py | 17 +-------------- vllm/model_executor/models/qwen2_audio.py | 16 -------------- vllm/model_executor/models/qwen2_moe.py | 10 --------- vllm/model_executor/models/qwen2_vl.py | 17 +-------------- vllm/model_executor/models/qwen3.py | 10 --------- vllm/model_executor/models/qwen3_moe.py | 10 --------- vllm/model_executor/models/skyworkr1v.py | 18 +--------------- vllm/model_executor/models/solar.py | 11 ---------- vllm/model_executor/models/stablelm.py | 10 --------- vllm/model_executor/models/starcoder2.py | 10 --------- vllm/model_executor/models/transformers.py | 9 -------- vllm/model_executor/models/ultravox.py | 16 -------------- vllm/model_executor/models/whisper.py | 10 --------- vllm/model_executor/models/zamba2.py | 19 ----------------- vllm/spec_decode/draft_model_runner.py | 2 +- vllm/spec_decode/multi_step_worker.py | 5 ++--- vllm/spec_decode/spec_decode_worker.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 16 ++++++++------ vllm/worker/cpu_enc_dec_model_runner.py | 2 +- vllm/worker/cpu_model_runner.py | 10 +++------ vllm/worker/enc_dec_model_runner.py | 2 +- vllm/worker/hpu_model_runner.py | 7 ++++--- vllm/worker/model_runner.py | 5 +++-- vllm/worker/multi_step_model_runner.py | 3 +-- vllm/worker/xpu_model_runner.py | 5 +++-- 103 files changed, 48 insertions(+), 1099 deletions(-) diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 161cc9fbf556..f73cf4b345fb 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -62,9 +62,8 @@ def test_scorer(model_name: str, batch_size: int, max_propose_len: int, scorer_worker = create_worker(Worker, model_name, block_size, num_gpu_blocks, seed) scorer_worker.model_runner.disable_logprobs = True # accessed by mqa_scorer - scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor = True - scorer_worker.model_runner.model.sampler.\ - should_modify_greedy_probs_inplace = True + scorer_worker.model_runner.sampler.include_gpu_probs_tensor = True + 
scorer_worker.model_runner.sampler.should_modify_greedy_probs_inplace = True vocab_size = scorer_worker.vocab_size diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 065715cbde4e..dfe8f20c70d6 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -24,7 +24,6 @@ from vllm.model_executor.layers.quantization.deepspeedfp import ( DeepSpeedFPConfig, DeepSpeedFPParameter) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -435,7 +434,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.unpadded_vocab_size = config.vocab_size self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -462,14 +460,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index edf67c860e97..7c716efab8ef 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -15,11 +15,10 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import (SamplerOutput, - SamplingMetadata, get_sampler) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs) @@ -527,7 +526,6 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.vocab_size, logit_scale) - self.sampler = get_sampler() def _validate_image_sizes( self, images: List[torch.Tensor]) -> List[torch.Tensor]: @@ -653,14 +651,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 8700f24d2bd2..d152287e8fa3 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 Adapted from # https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision -from 
functools import cached_property from typing import (Iterable, Literal, Mapping, Optional, Sequence, Set, Tuple, TypedDict, Union, cast) @@ -17,7 +16,6 @@ from vllm.config import VllmConfig from vllm.jsontree import json_map_leaves -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalDataDict, MultiModalKwargs @@ -461,17 +459,3 @@ def compute_logits( ) -> Optional[torch.Tensor]: return self.language_model.compute_logits(hidden_states, sampling_metadata) - - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 6a3112b5f769..444ed38d05c0 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -39,7 +39,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -396,7 +395,6 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -423,14 +421,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index dfb8f49cc014..16dac6123d66 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -24,7 +24,6 @@ MambaMixer2, extra_groups_for_head_shards) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -462,7 +461,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -538,14 +536,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - 
next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 04d6cde555e2..bcfbe92c3a11 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -37,7 +37,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -791,7 +790,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() def forward( self, @@ -828,14 +826,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - stacked_params_mapping = { "q_proj": { "param_name": "qkv_proj", diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index a6f00f999773..17c857ce096a 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import Literal, Optional, Set, Tuple, TypedDict, Union import torch @@ -12,7 +11,6 @@ from vllm.config import CacheConfig, VllmConfig from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -530,13 +528,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -649,7 +640,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, - ) -> Union[SamplerOutput, IntermediateTensors]: + ) -> IntermediateTensors: """Run forward pass for BLIP-2. 
One key thing to understand is the `input_ids` already accounts for the @@ -707,13 +698,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index f960075b98bc..74d401b295ce 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -35,7 +35,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -297,7 +296,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config.hidden_size) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -324,14 +322,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 0ad5e89df2e2..e2c275300f8c 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -22,7 +22,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -950,7 +949,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -1054,14 +1052,6 @@ def compute_logits( return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 1b1738f882b7..233e9ee0a258 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -21,7 +21,6 @@ from vllm.model_executor.layers.logits_processor import 
LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -429,7 +428,6 @@ def __init__( self.transformer.embedding.weight) self.lm_head = self.transformer.output_layer self.logits_processor = LogitsProcessor(config.padded_vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -442,14 +440,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 8912affe36fe..25b1d5a1955f 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -38,7 +38,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -372,7 +371,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): scale=config.logit_scale) self.model = CohereModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -406,14 +404,6 @@ def compute_logits( return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index b66529860bc2..40c0a73f52d5 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -16,7 +16,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -390,7 +389,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -417,14 +415,6 @@ def compute_logits( 
sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: expert_params_mapping = [( diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 5e036d049a8a..c6421143dd68 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -43,7 +43,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -453,7 +452,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -480,14 +478,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index e7fde76cd0ba..b50175cf764f 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -10,7 +10,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -154,8 +153,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=maybe_prefix( prefix, "model")) - self.sampler = get_sampler() - def forward( self, input_ids: torch.Tensor, @@ -179,14 +176,6 @@ def compute_logits( return self.model.compute_logits(hidden_states, sampling_metadata, spec_step_idx) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 23b450aeddac..ffa5840b4604 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -44,7 +44,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from 
vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -686,7 +685,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -713,14 +711,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def make_empty_intermediate_tensors( self, batch_size: int, dtype: torch.dtype, device: torch.device) -> IntermediateTensors: diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index c3dbadb29276..ac136698ee17 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -4,7 +4,6 @@ """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union import torch @@ -16,7 +15,6 @@ from vllm.config import VllmConfig from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -393,13 +391,6 @@ def _init_vision_module( model = model.to(dtype=torch.get_default_dtype()) return model - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_pixel_values( self, data: Union[torch.Tensor, List[torch.Tensor]] ) -> Union[torch.Tensor, List[torch.Tensor]]: @@ -647,13 +638,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 3e4a5040b7c8..4ff1e785494f 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -9,7 +9,6 @@ from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -131,10 +130,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # checkpoint file has token_map tensor. 
self.token_map = None - @property - def sampler(self): - return self.model.sampler - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.model.get_input_embeddings(input_ids) @@ -188,14 +183,6 @@ def compute_logits(self, hidden_states: torch.Tensor, return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B # due to missing lm_head weights and its config being that of a diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 553c524ebc37..4a6490cd127a 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -510,8 +509,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -538,14 +535,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 0e67b1ec94f6..e7e03fc09972 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -40,7 +40,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -473,7 +472,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -500,14 +498,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 
359cc7f37731..d1a36c3f481a 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -3,7 +3,6 @@ import math from collections import OrderedDict from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union import torch @@ -14,7 +13,6 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, BartParallelLMHead, @@ -673,7 +671,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.vocab_size, config.vocab_size) - self.sampler = get_sampler() def forward( self, @@ -716,11 +713,6 @@ def compute_logits( sampling_metadata) return logits - def sample(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> SamplerOutput: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ @@ -929,12 +921,6 @@ def _build_image_projection_layers(self, config: PretrainedConfig): raise NotImplementedError( 'Florence2 only supports COSINE as temporal embedding.') - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - return get_sampler() - def _validate_pixel_values( self, data: Union[torch.Tensor, List[torch.Tensor]] ) -> Union[torch.Tensor, List[torch.Tensor]]: @@ -1110,13 +1096,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> SamplerOutput: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 27cd8d0986a5..d6bd6155a447 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -27,7 +27,6 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ColumnParallelLinear -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -270,10 +269,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @property - def sampler(self): - return self.language_model.sampler - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.patch_size @@ -387,14 +382,6 @@ def compute_logits( self.language_model.lm_head, hidden_states, sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.language_model.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git 
a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 92d99883c774..c1cc0df11178 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -35,7 +35,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -388,7 +387,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = GemmaModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -415,14 +413,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index d125c666f3cd..35e698fca410 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -34,7 +34,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -388,7 +387,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor( config.vocab_size, soft_cap=config.final_logit_softcapping) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -415,14 +413,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index fb8eccc55078..34957300ff9a 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -34,7 +34,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -493,7 +492,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 
prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor( config.vocab_size, soft_cap=config.final_logit_softcapping) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -521,14 +519,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index e5a3d6762fff..6d3e6e389c47 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, Set, Tuple, TypedDict import torch from torch import nn @@ -12,7 +12,6 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import GemmaRMSNorm -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -503,10 +502,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def dtype(self): return next(self.parameters()).dtype - @property - def sampler(self): - return self.language_model.sampler - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -607,7 +602,7 @@ def forward(self, positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, - **kwargs: object) -> Union[SamplerOutput, IntermediateTensors]: + **kwargs: object) -> IntermediateTensors: if intermediate_tensors is not None: inputs_embeds = None @@ -704,13 +699,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 28cebfbd7baa..290be968cb54 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -37,7 +37,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -267,7 +266,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = 
get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -295,14 +293,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 776c03f652bd..e3219333915e 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -35,7 +35,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -255,7 +254,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = self.lm_head.tie_weights(self.transformer.wte) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -282,14 +280,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 40d01a2ecfc5..def6b1544d8c 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -35,7 +35,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -302,7 +301,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -329,14 +327,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 1f73d2ab0f1a..3db96fb8e187 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -34,7 +34,6 @@ from 
vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -306,7 +305,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -333,14 +331,6 @@ def compute_logits( sampling_metadata, self.lm_head.bias) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 582b2ff7e755..620ee66f57e7 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -34,7 +34,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -299,7 +298,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.embed_out.weight = self.gpt_neox.embed_in.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.gpt_neox.make_empty_intermediate_tensors) @@ -326,14 +324,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 3bd6332c11ca..0696a7245c22 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -441,8 +440,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() - self.sampler = get_sampler() - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -464,14 +461,6 @@ def 
compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def make_empty_intermediate_tensors( self, batch_size: int, dtype: torch.dtype, device: torch.device) -> IntermediateTensors: diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 367722126e56..7fff14cb9f12 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -391,8 +390,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): scale=1 / self.config.logits_scaling) - self.sampler = get_sampler() - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -428,14 +425,6 @@ def make_empty_intermediate_tensors( device=device), }) - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index cf8c969e118f..4e660cbf667b 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -20,7 +20,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -295,8 +294,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): scale=1 / self.config.logits_scaling) - self.sampler = get_sampler() - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -332,14 +329,6 @@ def make_empty_intermediate_tensors( device=device), }) - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index ef96257ba4bb..c48cb157084d 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -39,7 +39,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from 
vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -521,7 +520,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size, self.output_multiplier_scale) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -551,14 +549,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: skip_prefixes = ["rotary_emb.inv_freq"] diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index c31870461b4c..961954c2b584 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -28,7 +28,6 @@ from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -603,7 +602,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.text_config.tie_word_embeddings: self.lm_head.weight = self.model.text_model.wte.weight self.logits_processor = LogitsProcessor(config.text_config.vocab_size) - self.sampler = get_sampler() def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size @@ -754,14 +752,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 22c9287509ed..f141dcf3cd4f 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -13,7 +13,6 @@ if TYPE_CHECKING: from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import PoolerOutput - from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -103,14 +102,6 @@ def compute_logits( """Return `None` if TP rank > 0.""" ... - def sample( - self, - logits: T, - sampling_metadata: "SamplingMetadata", - ) -> "SamplerOutput": - """Only called on TP rank 0.""" - ... 
- @overload def is_text_generation_model( diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 520b85c0cdfb..c3d7cbfcddbb 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -23,7 +23,6 @@ from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -336,7 +335,6 @@ def __init__(self, if self.config.tie_word_embeddings: self.output.weight = self.model.tok_embeddings.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -363,14 +361,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ @@ -423,7 +413,7 @@ def __init__( prefix=prefix, model_type=model_type) - for attr in ("output", "logits_processor", "sampler"): + for attr in ("output", "logits_processor"): delattr(self, attr) config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 8f5f454cbf60..23b92ad2bbf6 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -8,7 +8,6 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union import torch @@ -20,7 +19,6 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -698,13 +696,6 @@ def _patch_quant_config(self, config: PretrainedConfig, (llm_quant_config is not None): quant_config.modules_to_not_convert.append("vision_model") - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _init_vision_model( self, config: PretrainedConfig, @@ -903,7 +894,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, - ) -> Union[SamplerOutput, IntermediateTensors]: + ) -> IntermediateTensors: if intermediate_tensors is not None: input_ids = None @@ -941,13 +932,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return 
self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: # unused modules appear in OpenGVLab/InternVideo2_5_Chat_8B diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 78fe6588eddc..e1e3f0f199c5 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -36,7 +36,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -308,7 +307,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.mup_width_scale) self.logits_processor = LogitsProcessor(vocab_size=config.vocab_size, scale=self.output_logits_scale) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -335,14 +333,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 6fabc8228e18..46335c2b3930 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -19,7 +19,6 @@ from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -409,7 +408,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -466,14 +464,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 1520f6992f0a..5c39907d79e4 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -58,7 +58,6 @@ get_tensor_model_parallel_world_size) from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( 
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.model_loader.weight_utils import ( @@ -298,7 +297,6 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = get_sampler() self.media_placeholder: int = self.config.media_placeholder_token_id self.tp_rank = get_tensor_model_parallel_rank() self.tp_world_size = get_tensor_model_parallel_world_size() @@ -409,7 +407,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, - ) -> SamplerOutput: + ) -> IntermediateTensors: if intermediate_tensors is not None: inputs_embeds = None # NOTE: In v1, inputs_embeds is always generated at model runner from @@ -447,14 +445,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata, **kwargs) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): config = self.config.text_config _KEYS_TO_MODIFY_MAPPING = { diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index caa4a5108a92..31ffa4e1e63c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -40,7 +40,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -515,8 +514,6 @@ def __init__(self, else: self.lm_head = PPMissingLayer() - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -551,11 +548,6 @@ def compute_logits( sampling_metadata) return logits - def sample(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index fbd212d17004..8862b2679f93 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -2,7 +2,6 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, TypeVar, Union, cast) @@ -23,7 +22,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -546,13 +544,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: 
self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -763,13 +754,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 9c4d0e1fc275..c646c0f03d1e 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod -from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, Protocol, Set, Tuple, TypedDict, TypeVar, Union) @@ -13,7 +12,6 @@ from typing_extensions import NotRequired from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig @@ -250,13 +248,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) @@ -585,13 +576,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 0221c6b237cb..a5ff189cfdb5 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -2,7 +2,6 @@ import math from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union import torch @@ -12,7 +11,6 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -301,13 +299,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( self.language_model.model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, 
"sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_video_pixel_values( self, data: Union[torch.Tensor, List[torch.Tensor]] ) -> Union[torch.Tensor, List[torch.Tensor]]: @@ -469,13 +460,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index ab2bf881ad65..5c2b388e403d 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -2,7 +2,6 @@ import math from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import (Final, List, Literal, Optional, Protocol, Set, Tuple, TypedDict, Union) @@ -16,7 +15,6 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -455,13 +453,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( self.language_model.model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) @@ -957,13 +948,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index ac95b65fd03f..af78ece66bbe 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -14,7 +14,6 @@ from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -213,7 +212,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.backbone.make_empty_intermediate_tensors) @@ -267,14 +265,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, 
sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 526dec46ff29..78303733f6bb 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -19,7 +19,6 @@ MambaMixer2, extra_groups_for_head_shards) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -208,7 +207,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.backbone.make_empty_intermediate_tensors) @@ -282,14 +280,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index cf03396a9ca9..866dc3f466e7 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -45,7 +45,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -553,7 +552,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -584,14 +582,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 1a91cf9bab47..65a26eadd5c8 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -25,7 +25,7 @@ import math from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property, partial +from functools import partial from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, Union) @@ -40,7 +40,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from 
vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.models.minicpm import MiniCPMForCausalLM @@ -758,13 +757,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.llm.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.llm, "sampler"): - return self.llm.sampler - - return get_sampler() - def _parse_and_validate_vision_input( self, modality: str, @@ -946,14 +938,6 @@ def compute_logits( ) -> Optional[torch.Tensor]: return self.llm.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 7562aa678d5a..74be08159cd8 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -33,7 +33,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -994,7 +993,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.config.vocab_size) - self.sampler = Sampler() else: self.lm_head = PPMissingLayer() @@ -1030,16 +1028,6 @@ def compute_logits(self, hidden_states: torch.Tensor, return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ): - - next_tokens = self.sampler(logits, sampling_metadata) - - return next_tokens - def make_empty_intermediate_tensors( self, batch_size: int, dtype: torch.dtype, device: torch.device) -> IntermediateTensors: diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 8b1a1d68fc3f..94cbb20cee5f 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -2,7 +2,6 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, TypeVar, Union) @@ -19,7 +18,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -435,13 +433,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.make_empty_intermediate_tensors = ( 
self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -598,13 +589,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index b0ac99f21ead..1513c8dad097 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -40,7 +40,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -454,7 +453,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -481,14 +479,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["rotary_emb.inv_freq"]) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 96eb925cf894..7c022a5b8f68 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -42,7 +42,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -372,7 +371,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -399,14 +397,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, 
sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 7bfb3ada6bb4..0c1d61c01f91 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -47,7 +47,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -1211,7 +1210,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.logits_processor = LogitsProcessor(config.output_hidden_states, config.text_config.vocab_size) - self.sampler = get_sampler() def compute_logits( self, @@ -1222,14 +1220,6 @@ def compute_logits( hidden_states, sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def unpack_data(self, image_data: Union[List[torch.Tensor], torch.Tensor], padding_value=0) -> torch.Tensor: diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 69e3ea8bd063..56a7f02c4159 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -17,7 +17,6 @@ # limitations under the License. import math from collections.abc import Iterable, Mapping -from functools import cached_property from itertools import tee from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union @@ -38,7 +37,6 @@ RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import _initialize_model from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -682,13 +680,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[Llama4ImagePatchInputs]: # num_images, 1, num_chunks, channel, image_size, image_size @@ -785,10 +776,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def separate_weights( self, weights: Iterable[Tuple[str, torch.Tensor]], diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index d75845b45e73..46147a333b06 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -35,7 +35,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from 
vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -1394,7 +1393,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.embedding_size or config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -1506,7 +1504,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, - ) -> SamplerOutput: + ) -> torch.Tensor: if intermediate_tensors is not None: inputs_embeds = None @@ -1532,14 +1530,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index b30f3ee37997..77bd794058cd 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -18,7 +18,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -298,7 +297,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=maybe_prefix(prefix, "transformer")) self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -325,14 +323,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 0ea296b2f93d..5208c0796c8d 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -38,7 +38,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -416,8 +415,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() - 
self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -444,14 +441,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 5c9b04cab180..264999496876 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -34,7 +34,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -408,8 +407,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -439,11 +436,6 @@ def compute_logits( sampling_metadata) return logits - def sample(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 4a341c97d6cd..0781ca168f84 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -39,7 +39,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -309,7 +308,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -340,14 +338,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index f9427cdadf7a..e7a5f91a3dae 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -42,7 +42,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from 
vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -339,7 +338,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): prefix=maybe_prefix(prefix, "lm_head"), ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -367,14 +365,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 296bac51d012..9bed29d0132f 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -31,7 +31,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -417,7 +416,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -442,14 +440,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 14aae2fbf817..d258eddae25d 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -35,7 +35,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -375,7 +374,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = ParallelLMHead(config.vocab_size, config.word_embed_proj_dim) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -402,14 +400,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: 
torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 5d35234544c3..8d9c000750d7 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -22,7 +22,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -316,7 +315,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -343,14 +341,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 6c1bd499f639..8699ae52622d 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -8,7 +8,6 @@ from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -260,10 +259,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @property - def sampler(self): - return self.language_model.sampler - def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -369,7 +364,7 @@ def forward(self, positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, - **kwargs: object) -> Union[SamplerOutput, IntermediateTensors]: + **kwargs: object) -> IntermediateTensors: if intermediate_tensors is not None: inputs_embeds = None @@ -396,13 +391,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 15afea82f3a0..6e26658c9261 100644 --- 
a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -38,7 +38,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -306,7 +305,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, bias=False) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -337,14 +335,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index fdf7734595a5..fc2b108bad97 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -53,7 +53,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -322,7 +321,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): bias=True, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -350,14 +348,6 @@ def compute_logits( sampling_metadata, self.lm_head.bias) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 7b02c9edfad2..338e87b4285f 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -17,7 +17,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -396,7 +395,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = 
self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -437,6 +435,7 @@ def compute_logits( sampling_metadata) if self.dummy_token_indices is not None and logits is not None: logits.index_fill_(-1, self.dummy_token_indices, -torch.inf) + logits = logits / self.mup_width_multiplier return logits def forward( @@ -455,16 +454,6 @@ def forward( output_hidden_states = output_hidden_states return output_hidden_states - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - - next_tokens = self.sampler(logits / self.mup_width_multiplier, - sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 5b43871b7591..a1442251b992 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -16,7 +16,6 @@ # limitations under the License. import re from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union import torch @@ -27,7 +26,6 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -555,13 +553,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) @@ -716,13 +707,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 03ca143f9c08..6035994f4336 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -13,7 +13,6 @@ from vllm.distributed import get_pp_group from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.models.llama import LlamaModel @@ -968,7 +967,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = get_sampler() def _parse_and_validate_audio_input( self, **kwargs: object) -> 
Optional[Phi4MMAudioInputs]: @@ -1244,14 +1242,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None: weights = ((name, data) for name, data in weights diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 381a33d98b9c..2dc55e4c352e 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -40,7 +40,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -634,7 +633,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -659,14 +657,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 38e140a91ecf..58323d639d5d 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -28,7 +28,6 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs @@ -331,13 +330,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[PixtralImagePixelInputs]: images = kwargs.pop("images", None) @@ -441,13 +433,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): def is_vision_encoder_weights(weight: Tuple[str, torch.Tensor]): diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index fb1442526c6c..790c48ccd216 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -24,7 
+24,6 @@ selective_scan_fn, selective_state_update) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -628,7 +627,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.config.vocab_size) - self.sampler = get_sampler() # Initialize weights and apply final processing self.post_init() @@ -684,14 +682,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index a33739a8eef9..e75294bc6cba 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -24,7 +24,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -273,7 +272,6 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.transformer.wte.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -286,14 +284,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 2831a5a12330..f76f31c9fc8d 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -43,7 +43,6 @@ from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -450,7 +449,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -478,14 +476,6 @@ def compute_logits( 
sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 19bdae1de70a..93c21fc55c5c 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -22,7 +22,7 @@ """Inference-only Qwen2.5-Omni model (thinker part).""" from copy import copy -from functools import cached_property, partial +from functools import partial from typing import (Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple, Union) @@ -40,7 +40,6 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.qwen2_5_vl import ( Qwen2_5_VisionTransformer, Qwen2_5_VLImageEmbeddingInputs, Qwen2_5_VLImageInputs, Qwen2_5_VLImagePixelInputs, @@ -790,13 +789,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: mm_input_by_modality = {} @@ -937,13 +929,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 0ab55411bad4..76175d79fb53 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -24,7 +24,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" -from functools import cached_property, partial +from functools import partial from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -51,7 +51,6 @@ from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinConfig) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY @@ -833,13 +832,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): # GPTQ configs do not have a list of ignored modules, however AutoGPTQ # seems to avoid vision encoder sections for some models. @@ -1127,13 +1119,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 280cda0f68f1..0cb541c6cbb2 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -22,7 +22,6 @@ # limitations under the License. 
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import Any, Optional, Set, Tuple, TypedDict, Union import torch @@ -34,7 +33,6 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -267,13 +265,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _validate_and_reshape_mm_tensor(self, mm_input: object, name: str) -> torch.Tensor: if not isinstance(mm_input, (torch.Tensor, list)): @@ -405,13 +396,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 2700c706b972..62696678b7af 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -47,7 +47,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -497,7 +496,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -524,14 +522,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 8c24b8f7df52..8c2895ce9d18 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -24,7 +24,7 @@ # limitations under the License. 
"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property, partial +from functools import partial from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, Union) @@ -51,7 +51,6 @@ from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinConfig) -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY @@ -1112,13 +1111,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): # GPTQ configs do not have a list of ignored modules, however AutoGPTQ # seems to avoid vision encoder sections for some models. @@ -1400,13 +1392,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 9c14038e6113..73d2838f461e 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -38,7 +38,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors @@ -283,7 +282,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -311,14 +309,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index f0ef79dfdfe2..70f9956e3efc 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -44,7 +44,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler 
from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -494,7 +493,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -521,14 +519,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 19a23162aa84..e78c37b65f87 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -8,7 +8,6 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union import torch @@ -21,7 +20,6 @@ from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -699,13 +697,6 @@ def _patch_quant_config(self, config: PretrainedConfig, (llm_quant_config is not None): quant_config.modules_to_not_convert.append("vision_model") - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def _init_vision_model( self, config: PretrainedConfig, @@ -908,7 +899,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, - ) -> Union[SamplerOutput, IntermediateTensors]: + ) -> IntermediateTensors: if intermediate_tensors is not None: input_ids = None @@ -946,13 +937,6 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: skip_prefixes = [ diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 1cae0a7fe0dc..f86aff7ba7ef 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -41,7 +41,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( 
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -418,8 +417,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -440,14 +437,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 53f520304abc..13460d1dfd49 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -36,7 +36,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -310,7 +309,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -337,14 +335,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 8b9fb7cb7bc6..6eebe4c4d614 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -36,7 +36,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -317,7 +316,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -344,14 +342,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: 
Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 212e35e79d45..a37e88a387fd 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -35,7 +35,6 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -396,8 +395,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -435,12 +432,6 @@ def compute_logits( sampling_metadata) return logits - def sample(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: - - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index cb5ff4ed6365..bfa48099b741 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,7 +3,6 @@ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" from collections.abc import Iterable, Mapping, Sequence -from functools import cached_property from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union import torch @@ -18,7 +17,6 @@ from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import MulAndSilu, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -438,13 +436,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) - @cached_property - def sampler(self): - if hasattr(self.language_model, "sampler"): - return self.language_model.sampler - - return get_sampler() - def get_mm_mapping(self) -> MultiModelKeys: """ Get the module prefix in multimodal models @@ -628,13 +619,6 @@ def compute_logits(self, hidden_states: torch.Tensor, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 63e71f268805..908cd7885aa8 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -21,7 +21,6 @@ from 
vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -669,7 +668,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() def forward( self, @@ -724,14 +722,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index ea21fffaede5..d34033e3ac90 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -31,7 +31,6 @@ MambaMixer2, extra_groups_for_head_shards) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -870,7 +869,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: # Initialize logits processing and sampling self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = get_sampler() def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: """Convert input token IDs to embeddings. @@ -1004,23 +1002,6 @@ def compute_logits( sampling_metadata) return logits - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - """Sample next tokens from computed logits. - - Args: - logits: Computed logits for next token prediction - sampling_metadata: Metadata for sampling process - - Returns: - Sampled tokens and related sampling information - """ - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 3ad9b4993327..24095ef2a567 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -295,7 +295,7 @@ def execute_model( if not self.is_driver_worker: return [] # Sample the next token. 
- output = self.model.sample( + output = self.model_runner.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index d8d54918fa98..6473740ae512 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -50,11 +50,10 @@ def init_device(self) -> None: def set_include_gpu_probs_tensor(self) -> None: # Need include_gpu_probs_tensor for MultiStepWorker - self.model_runner.model.sampler.include_gpu_probs_tensor = True + self.model_runner.sampler.include_gpu_probs_tensor = True def set_should_modify_greedy_probs_inplace(self) -> None: - self.model_runner.model.sampler.should_modify_greedy_probs_inplace = ( - True) + self.model_runner.sampler.should_modify_greedy_probs_inplace = True @torch.inference_mode() def sampler_output( diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index a724beade129..4e79003de391 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -410,9 +410,9 @@ def _configure_model_sampler_for_spec_decode(self): NOTE(cade): This will require a special check if the proposer worker does not have a sampler (e.g. ngram speculation). """ - (self.scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor + (self.scorer_worker.model_runner.sampler.include_gpu_probs_tensor ) = True - (self.scorer_worker.model_runner.model.sampler. + (self.scorer_worker.model_runner.sampler. should_modify_greedy_probs_inplace) = True self.proposer_worker.set_include_gpu_probs_tensor() self.proposer_worker.set_should_modify_greedy_probs_inplace() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c12e4fd555d2..86f6a301fbb6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -38,6 +38,7 @@ ModelRunnerOutput) from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler +from vllm.v1.sample.sampler import Sampler from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer @@ -153,6 +154,9 @@ def __init__( self.max_num_encoder_input_tokens = encoder_compute_budget self.encoder_cache_size = encoder_cache_size + # Sampler + self.sampler = Sampler() + # Lazy initialization # self.model: nn.Module # Set after load_model self.kv_caches: list[torch.Tensor] = [] @@ -1096,7 +1100,7 @@ def execute_model( # Sample the next token and get logprobs if needed. sampling_metadata = self.input_batch.sampling_metadata if spec_decode_metadata is None: - sampler_output = self.model.sample( + sampler_output = self.sampler( logits=logits, sampling_metadata=sampling_metadata, ) @@ -1106,7 +1110,7 @@ def execute_model( # logits tensor. This means any in-place operations on bonus_logits # won't affect the original logits tensor. bonus_logits = logits[spec_decode_metadata.bonus_logits_indices] - sampler_output = self.model.sample( + sampler_output = self.sampler( logits=bonus_logits, sampling_metadata=sampling_metadata, ) @@ -1383,8 +1387,8 @@ def _get_prompt_logprobs_dict( tgt_token_ids = prompt_token_ids[start_tok:start_tok + num_logits] # Compute prompt logprobs. 
- logprobs = self.model.sampler.compute_logprobs(logits) - token_ids, logprobs, ranks = self.model.sampler.gather_logprobs( + logprobs = self.sampler.compute_logprobs(logits) + token_ids, logprobs, ranks = self.sampler.gather_logprobs( logprobs, num_prompt_logprobs, tgt_token_ids) # Transfer GPU->CPU async. @@ -1502,8 +1506,8 @@ def _dummy_sampler_run( bad_words_token_ids={}, ) try: - sampler_output = self.model.sample( - logits=logits, sampling_metadata=dummy_metadata) + sampler_output = self.sampler(logits=logits, + sampling_metadata=dummy_metadata) except RuntimeError as e: if 'out of memory' in str(e): raise RuntimeError( diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index ac7c93e48395..c2120c035175 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -316,7 +316,7 @@ def execute_model( return [] # Sample the next token. - output = self.model.sample( + output = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index f22d45ed8287..29fbfbf0d375 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -19,7 +19,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_lora, supports_multimodal from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, @@ -490,6 +490,7 @@ def __init__( self.model: nn.Module # Set after init_Model # Set after load_model. self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None + self.sampler = get_sampler() if hasattr(self, "_builder_cls"): # multi-step model runner does not have `_builder_cls` @@ -545,11 +546,6 @@ def _prepare_model_input_tensors( return self.builder.build() # type: ignore - # sampler property will be used by spec_decode_worker - @property - def sampler(self): - return self.model.sampler - @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() @@ -677,7 +673,7 @@ def execute_model( return [] # Sample the next token. - output = self.model.sample( + output = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 72ff9d66a689..dee59041ec6f 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -205,7 +205,7 @@ def execute_model( model_input.async_callback() # Sample the next token. 
- output: SamplerOutput = self.model.sample( + output: SamplerOutput = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 2d31024b47d0..1bcef841b062 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -41,7 +41,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader import get_model @@ -316,6 +316,7 @@ class HpuModelAdapter: def __init__(self, model, vllm_config): self.model = model + self.sampler = get_sampler() self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] self.vllm_config = vllm_config @@ -454,7 +455,7 @@ def compute_logits(self, *args, **kwargs): return self.model.compute_logits(*args, **kwargs) def sample(self, *args, **kwargs): - return self.model.sample(*args, **kwargs) + return self.sampler(*args, **kwargs) class PreparePromptMetadata(NamedTuple): @@ -2167,7 +2168,7 @@ def execute_model( # in case of multi-step scheduling # we only want to pythonize in the last step sampling_metadata.skip_sampler_cpu_output = True - self.model.model.sampler.include_gpu_probs_tensor = True + self.model.sampler.include_gpu_probs_tensor = True cache_orig_output_tokens_len: List[Dict] = [] def try_revert_dummy_output_tokens(): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2ebada343d0f..3b09c92ae155 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -35,7 +35,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata, SamplingMetadataCache from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.models import supports_lora, supports_multimodal @@ -1094,6 +1094,7 @@ def __init__( # Set after load_model. self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None self.prompt_adapter_manager: LRUCacheWorkerPromptAdapterManager = None + self.sampler = get_sampler() set_cpu_offload_max_bytes( int(self.cache_config.cpu_offload_gb * 1024**3)) @@ -1832,7 +1833,7 @@ def execute_model( model_input.async_callback() # Sample the next token. 
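The hunks above all apply the same refactor: the sampler is no longer reached through the model (`self.model.sample(...)` / `self.model.sampler`) but is constructed and owned by the model runner itself and called directly. A minimal stand-alone sketch of that ownership change, using toy classes rather than the real vLLM runner/sampler types:

    # Toy illustration only; _SamplerStub and _RunnerSketch are invented
    # names, not vLLM classes.
    class _SamplerStub:
        def __call__(self, logits, sampling_metadata=None):
            # Greedy pick, standing in for the real sampling logic.
            return max(range(len(logits)), key=logits.__getitem__)

    class _RunnerSketch:
        def __init__(self, model):
            self.model = model
            self.sampler = _SamplerStub()  # the runner owns the sampler now

        def execute_model(self, logits, sampling_metadata=None):
            # Previously: return self.model.sample(logits, sampling_metadata)
            return self.sampler(logits, sampling_metadata)

    print(_RunnerSketch(model=None).execute_model([0.1, 2.5, 0.3]))  # -> 1
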
- output: SamplerOutput = self.model.sample( + output: SamplerOutput = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 7ddf382079c6..a6f5ec825635 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -488,8 +488,7 @@ def execute_model( device="cpu", pin_memory=True) - self._base_model_runner.model.sampler.include_gpu_probs_tensor = ( - True) + self._base_model_runner.sampler.include_gpu_probs_tensor = True if frozen_model_input.sampling_metadata: frozen_model_input.sampling_metadata.skip_sampler_cpu_output = ( True) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 9d49b4385dca..fc9461a24b90 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -18,7 +18,7 @@ from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor import SamplingMetadataCache -from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalKwargs, MultiModalPlaceholderMap, @@ -410,6 +410,7 @@ def __init__( # Lazy initialization. self.model: nn.Module # Set after init_Model + self.sampler = get_sampler() self.sampling_metadata_cache: SamplingMetadataCache = \ SamplingMetadataCache() \ @@ -596,7 +597,7 @@ def execute_model( model_input.async_callback() # Sample the next token. - output: SamplerOutput = self.model.sample( + output: SamplerOutput = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, ) From 14288d1332602846d3f36c4a0d65b13d335cf557 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 24 Apr 2025 03:50:09 -0600 Subject: [PATCH 40/81] Disable enforce_eager for V1 TPU sampler and structured output tests (#17016) Signed-off-by: mgoin --- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 1 + tests/v1/entrypoints/llm/test_struct_output_generate.py | 6 +++++- tests/v1/tpu/test_sampler.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 704bc6b7324d..21982b01b9cc 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -19,6 +19,7 @@ docker run --privileged --net host --shm-size=16G -it \ vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ && python3 -m pip install pytest pytest-asyncio tpu-info \ && python3 -m pip install lm_eval[api]==0.4.4 \ + && export VLLM_XLA_CACHE_PATH= \ && export VLLM_USE_V1=1 \ && export VLLM_XLA_CHECK_RECOMPILATION=1 \ && echo HARDWARE \ diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index fc8e271f7f91..1e4a80539972 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -13,6 +13,7 @@ from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput +from vllm.platforms import current_platform from vllm.sampling_params import GuidedDecodingParams, SamplingParams PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ @@ -63,10 +64,13 @@ def test_structured_output( ): 
monkeypatch.setenv("VLLM_USE_V1", "1") + # Don't use eager execution on TPUs because we want to test for no + # recompilation at runtime + enforce_eager = bool(not current_platform.is_tpu()) # Use a single LLM instance for several scenarios to # speed up the test suite. llm = LLM(model=model_name, - enforce_eager=True, + enforce_eager=enforce_eager, max_model_len=1024, guided_decoding_backend=guided_decoding_backend, tokenizer_mode=tokenizer_mode) diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py index 046d3e404e4f..c6b492b5a3cc 100644 --- a/tests/v1/tpu/test_sampler.py +++ b/tests/v1/tpu/test_sampler.py @@ -23,7 +23,7 @@ def test_sampler_different(model_name: str): different results. """ llm = LLM(model_name, - enforce_eager=True, + enforce_eager=False, max_num_seqs=1, max_model_len=512, max_num_batched_tokens=512) From 0a05ed57e6865cea4c00acf060dd2e6c456c0a49 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 24 Apr 2025 12:43:56 +0100 Subject: [PATCH 41/81] Simplify `TokenizerGroup` (#16790) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/conftest.py | 16 +- tests/lora/test_tokenizer_group.py | 10 +- tests/tokenization/test_detokenize.py | 9 +- tests/tokenization/test_tokenizer_group.py | 187 +------------- tests/v1/engine/conftest.py | 2 +- tests/v1/engine/utils.py | 5 +- vllm/config.py | 97 ++----- vllm/engine/arg_utils.py | 26 +- vllm/engine/async_llm_engine.py | 2 - vllm/engine/llm_engine.py | 30 +-- vllm/engine/multiprocessing/client.py | 1 - vllm/entrypoints/llm.py | 5 +- vllm/inputs/preprocess.py | 6 +- vllm/transformers_utils/detokenizer.py | 4 +- .../{tokenizer_group => }/tokenizer_group.py | 30 ++- .../tokenizer_group/__init__.py | 56 ---- .../tokenizer_group/base_tokenizer_group.py | 68 ----- .../tokenizer_group/ray_tokenizer_group.py | 244 ------------------ vllm/v1/engine/async_llm.py | 2 - vllm/v1/engine/llm_engine.py | 20 +- vllm/v1/engine/output_processor.py | 4 +- vllm/v1/engine/processor.py | 4 +- vllm/v1/structured_output/backend_guidance.py | 2 - vllm/v1/structured_output/backend_xgrammar.py | 2 - 24 files changed, 80 insertions(+), 752 deletions(-) rename vllm/transformers_utils/{tokenizer_group => }/tokenizer_group.py (84%) delete mode 100644 vllm/transformers_utils/tokenizer_group/__init__.py delete mode 100644 vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py delete mode 100644 vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py diff --git a/tests/conftest.py b/tests/conftest.py index 25e70319e2cc..0aaf637d41d8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,7 +23,7 @@ from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import TaskOption, TokenizerPoolConfig, _get_and_verify_dtype +from vllm.config import TaskOption, _get_and_verify_dtype from vllm.connections import global_http_connection from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, @@ -1010,20 +1010,6 @@ def vllm_runner(): return VllmRunner -def get_tokenizer_pool_config(tokenizer_group_type): - if tokenizer_group_type is None: - return None - if tokenizer_group_type == "ray": - return TokenizerPoolConfig(pool_size=1, - pool_type="ray", - extra_config={}) - if isinstance(tokenizer_group_type, type): - return TokenizerPoolConfig(pool_size=1, - pool_type=tokenizer_group_type, - extra_config={}) - raise ValueError(f"Unknown tokenizer_group_type: 
{tokenizer_group_type}") - - @pytest.fixture() def temporary_enable_log_propagate(): import logging diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index d605ab734688..8845eb33d207 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -5,17 +5,14 @@ from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import get_lora_tokenizer -from vllm.transformers_utils.tokenizer_group import get_tokenizer_group - -from ..conftest import get_tokenizer_pool_config +from vllm.transformers_utils.tokenizer_group import TokenizerGroup @pytest.mark.asyncio @pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files) - tokenizer_group = get_tokenizer_group( - get_tokenizer_pool_config(tokenizer_group_type), + tokenizer_group = TokenizerGroup( tokenizer_id="gpt2", enable_lora=True, max_num_seqs=1, @@ -60,8 +57,7 @@ def test_get_lora_tokenizer(sql_lora_files, tmp_path): @pytest.mark.parametrize("max_num_seqs", [1, 2]) @pytest.mark.parametrize("max_loras", [1, 2]) def test_lora_tokenizers(enable_lora, max_num_seqs, max_loras): - tokenizer_group = get_tokenizer_group( - get_tokenizer_pool_config(None), + tokenizer_group = TokenizerGroup( tokenizer_id="gpt2", enable_lora=enable_lora, max_num_seqs=max_num_seqs, diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 0f8b98a13581..da6c774f439a 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -10,7 +10,7 @@ from vllm.inputs import token_inputs from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer_group import get_tokenizer_group +from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.detokenizer import (FastIncrementalDetokenizer, @@ -212,7 +212,7 @@ def test_oov_decode(tokenizer, fast): @pytest.fixture def detokenizer(tokenizer_name: str) -> Detokenizer: - init_kwargs = dict( + tokenizer_group = TokenizerGroup( tokenizer_id=tokenizer_name, enable_lora=False, max_num_seqs=100, @@ -222,11 +222,6 @@ def detokenizer(tokenizer_name: str) -> Detokenizer: revision=None, ) - tokenizer_group = get_tokenizer_group( - None, - **init_kwargs, - ) - return Detokenizer(tokenizer_group) diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index 5b62f992c1be..bcfa78ed41cf 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -1,40 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 -import asyncio -import os -import sys -from typing import Optional -from unittest.mock import patch - import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase -from vllm.transformers_utils.tokenizer_group import (TokenizerGroup, - get_tokenizer_group) -from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import ( - RayTokenizerGroupPool) - -from ..conftest import get_tokenizer_pool_config - - -class CustomTokenizerGroup(TokenizerGroup): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._i = 0 - - def encode(self, *args, **kwargs): - self._i += 1 - 
return super().encode(*args, **kwargs) +from vllm.transformers_utils.tokenizer_group import TokenizerGroup @pytest.mark.asyncio -@pytest.mark.parametrize("tokenizer_group_type", - [None, "ray", CustomTokenizerGroup]) -async def test_tokenizer_group(tokenizer_group_type): +async def test_tokenizer_group(): reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") - tokenizer_group = get_tokenizer_group( - get_tokenizer_pool_config(tokenizer_group_type), + tokenizer_group = TokenizerGroup( tokenizer_id="gpt2", enable_lora=False, max_num_seqs=1, @@ -49,159 +24,3 @@ async def test_tokenizer_group(tokenizer_group_type): PreTrainedTokenizerBase) assert tokenizer_group.get_lora_tokenizer( None) == await tokenizer_group.get_lora_tokenizer_async(None) - if tokenizer_group_type is CustomTokenizerGroup: - assert tokenizer_group._i > 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("tokenizer_group_type", ["ray"]) -async def test_tokenizer_group_pool(tokenizer_group_type): - reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") - tokenizer_group_pool = get_tokenizer_group( - get_tokenizer_pool_config(tokenizer_group_type), - tokenizer_id="gpt2", - enable_lora=False, - max_num_seqs=1, - max_input_length=None, - ) - # Send multiple requests to the tokenizer group pool - # (more than the pool size) - # and check that all requests are processed correctly. - num_requests = tokenizer_group_pool.pool_size * 5 - requests = [ - tokenizer_group_pool.encode_async(prompt=f"prompt {i}", - lora_request=None) - for i in range(num_requests) - ] - results = await asyncio.gather(*requests) - expected_results = [ - reference_tokenizer.encode(f"prompt {i}") for i in range(num_requests) - ] - assert results == expected_results - - -@pytest.mark.asyncio -@pytest.mark.parametrize("tokenizer_group_type", ["ray"]) -async def test_tokenizer_group_ray_pool_env_var_propagation( - tokenizer_group_type): - """Test that env vars from caller process are propagated to - tokenizer Ray actors.""" - env_var = "MY_ENV_VAR" - - class EnvVarCheckerTokenizerGroup(TokenizerGroup): - - def ping(self): - assert os.environ.get(env_var) == "1" - return super().ping() - - class EnvVarCheckerRayTokenizerGroupPool(RayTokenizerGroupPool): - _worker_cls = EnvVarCheckerTokenizerGroup - - tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type) - tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config( - tokenizer_pool_config, - tokenizer_id="gpt2", - enable_lora=False, - max_num_seqs=1, - max_input_length=None) - with pytest.raises(AssertionError): - tokenizer_pool.ping() - - with patch.dict(os.environ, {env_var: "1"}): - tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type) - tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config( - tokenizer_pool_config, - tokenizer_id="gpt2", - enable_lora=False, - max_num_seqs=1, - max_input_length=None) - tokenizer_pool.ping() - - -@pytest.mark.asyncio -@pytest.mark.parametrize("tokenizer_group_type", ["ray"]) -async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type): - """Test that Ray tokenizer pool group can recover from failures and - if that's not possible, mark itself as unhealthy.""" - - class FailingTokenizerGroup(TokenizerGroup): - - def __init__(self, - *args, - fail_at: Optional[list[int]] = None, - **kwargs): - super().__init__(*args, **kwargs) - self.i = 0 - self.fail_at = fail_at or [] - - def encode(self, *args, **kwargs): - self.i += 1 - if self.i in self.fail_at: - sys.exit(1) - return super().encode(*args, 
**kwargs) - - class FailingRayTokenizerGroupPool(RayTokenizerGroupPool): - _worker_cls = FailingTokenizerGroup - - # Fail at first iteration - fail_at = [1] - tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type) - tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( - tokenizer_pool_config, - tokenizer_id="gpt2", - enable_lora=False, - max_num_seqs=1, - max_input_length=None, - fail_at=fail_at) - tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy() - - # Modify fail at to not fail at all (will be re-read when actor is - # re-initialized). - fail_at[0] = 1000 - - # We should recover successfully. - await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None) - await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None) - - # Check that we have a new actor - assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors) - assert tokenizer_group_pool.tokenizer_actors != tokenizer_actors - - # Fail at first iteration - fail_at = [1] - tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( - tokenizer_pool_config, - tokenizer_id="gpt2", - enable_lora=False, - max_num_seqs=1, - max_input_length=None, - fail_at=fail_at) - - # We should fail after re-initialization. - with pytest.raises(RuntimeError): - await tokenizer_group_pool.encode_async(prompt="prompt", - lora_request=None) - - # check_health should raise the same thing - with pytest.raises(RuntimeError): - tokenizer_group_pool.check_health() - - # Ensure that non-ActorDiedErrors are still propagated correctly and do not - # cause a re-initialization. - fail_at = [] - tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config( - tokenizer_pool_config, - tokenizer_id="gpt2", - enable_lora=False, - max_num_seqs=1, - max_input_length=2, - fail_at=fail_at) - tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy() - - # Prompt too long error - with pytest.raises(ValueError): - await tokenizer_group_pool.encode_async(prompt="prompt" * 100, - lora_request=None) - await tokenizer_group_pool.encode_async(prompt="prompt", lora_request=None) - # Actors should stay the same. 
- assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py index 8872f0388dd2..f8addd920d57 100644 --- a/tests/v1/engine/conftest.py +++ b/tests/v1/engine/conftest.py @@ -47,7 +47,7 @@ def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors: tokenizer=tokenizer, tokenizer_group=init_tokenizer_from_configs( vllm_config.model_config, vllm_config.scheduler_config, - vllm_config.parallel_config, vllm_config.lora_config), + vllm_config.lora_config), vllm_config=vllm_config, full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS], prompt_tokens=prompt_tokens, diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index 1ee93c72cd26..4a23e0c1b212 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -8,8 +8,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from vllm.engine.arg_utils import EngineArgs -from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( - BaseTokenizerGroup) +from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreOutput, FinishReason from vllm.v1.outputs import LogprobsLists, LogprobsTensors @@ -296,7 +295,7 @@ def generate_dummy_prompt_logprobs_tensors( class DummyOutputProcessorTestVectors: """Dummy test vectors for output processor tests""" tokenizer: GeneralTokenizerType - tokenizer_group: BaseTokenizerGroup + tokenizer_group: TokenizerGroup vllm_config: EngineArgs full_tokens: list[list[int]] # Prompt + generated tokens prompt_tokens: list[list[int]] diff --git a/vllm/config.py b/vllm/config.py index 741ce04d5dff..12e6747c79ff 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -52,8 +52,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.model_loader.loader import BaseModelLoader - from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( - BaseTokenizerGroup) ConfigType = type[DataclassInstance] else: @@ -1407,83 +1405,33 @@ def verify_with_parallel_config( logger.warning("Possibly too large swap space. %s", msg) -PoolType = Literal["ray"] - - @config @dataclass class TokenizerPoolConfig: - """Configuration for the tokenizer pool.""" + """This config is deprecated and will be removed in a future release. - pool_size: int = 0 - """Number of tokenizer workers in the pool to use for asynchronous - tokenization. If 0, will use synchronous tokenization.""" - - pool_type: Union[PoolType, type["BaseTokenizerGroup"]] = "ray" - """Type of tokenizer pool to use for asynchronous tokenization. Ignored if - tokenizer_pool_size is 0.""" + Passing these parameters will have no effect. Please remove them from your + configurations. + """ + pool_size: int = 0 + """This parameter is deprecated and will be removed in a future release. + Passing this parameter will have no effect. Please remove it from your + configurations.""" + pool_type: str = "ray" + """This parameter is deprecated and will be removed in a future release. + Passing this parameter will have no effect. Please remove it from your + configurations.""" extra_config: dict = field(default_factory=dict) - """Additional config for the pool. The way the config will be used depends - on the pool type. This should be a JSON string that will be parsed into a - dictionary. 
Ignored if tokenizer_pool_size is 0.""" - - def compute_hash(self) -> str: - """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. + """This parameter is deprecated and will be removed in a future release. + Passing this parameter will have no effect. Please remove it from your + configurations.""" - Provide a hash that uniquely identifies all the configs - that affect the structure of the computation - graph from input ids/embeddings to the final hidden states, - excluding anything before input ids/embeddings and after - the final hidden states. - """ - # no factors to consider. - # this config will not affect the computation graph. - factors: list[Any] = [] - hash_str = hashlib.md5(str(factors).encode(), - usedforsecurity=False).hexdigest() - return hash_str - - def __post_init__(self): - if self.pool_type not in ("ray", ) and not isinstance( - self.pool_type, type): - raise ValueError(f"Unknown pool type: {self.pool_type}") - if not isinstance(self.extra_config, dict): - raise ValueError("extra_config must be a dictionary.") - - @classmethod - def create_config( - cls, tokenizer_pool_size: int, - tokenizer_pool_type: Union[PoolType, type["BaseTokenizerGroup"]], - tokenizer_pool_extra_config: Optional[Union[str, dict]] - ) -> Optional["TokenizerPoolConfig"]: - """Create a TokenizerPoolConfig from the given parameters. - - If tokenizer_pool_size is 0, return None. - - Args: - tokenizer_pool_size: Number of tokenizer workers in the pool. - tokenizer_pool_type: Type of the pool. - tokenizer_pool_extra_config: Additional config for the pool. - The way the config will be used depends on the - pool type. This can be a JSON string (will be parsed). - """ - if tokenizer_pool_size: - if isinstance(tokenizer_pool_extra_config, str): - tokenizer_pool_extra_config_parsed = json.loads( - tokenizer_pool_extra_config) - else: - tokenizer_pool_extra_config_parsed = ( - tokenizer_pool_extra_config or {}) - tokenizer_pool_config = cls(tokenizer_pool_size, - tokenizer_pool_type, - tokenizer_pool_extra_config_parsed) - else: - tokenizer_pool_config = None - return tokenizer_pool_config + def __post_init__(self) -> None: + logger.warning_once( + "TokenizerPoolConfig is deprecated and will be removed in a " + "future release. Passing this parameter will have no effect. " + "Please remove it from your configurations.") class LoadFormat(str, enum.Enum): @@ -1624,8 +1572,8 @@ def data_parallel_rank_local(self, value: int) -> None: """Disable the custom all-reduce kernel and fall back to NCCL.""" tokenizer_pool_config: Optional[TokenizerPoolConfig] = None - """Config for the tokenizer pool. If None, will use synchronous - tokenization.""" + """This parameter is deprecated and will be removed in a future release. + Please remove it from your configs""" ray_workers_use_nsight: bool = False """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler.""" @@ -2544,7 +2492,6 @@ def create_draft_parallel_config( max_parallel_loading_workers, disable_custom_all_reduce=target_parallel_config. disable_custom_all_reduce, - tokenizer_pool_config=target_parallel_config.tokenizer_pool_config, ray_workers_use_nsight=target_parallel_config. 
ray_workers_use_nsight, placement_group=target_parallel_config.placement_group, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 00328f56b713..6d6b5ac02b14 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -7,9 +7,8 @@ import re import threading from dataclasses import MISSING, dataclass, fields -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Literal, - Optional, Tuple, Type, TypeVar, Union, cast, get_args, - get_origin) +from typing import (Any, Callable, Dict, List, Literal, Optional, Tuple, Type, + TypeVar, Union, cast, get_args, get_origin) import torch from typing_extensions import TypeIs @@ -23,7 +22,7 @@ KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig, ModelConfig, ModelImpl, MultiModalConfig, ObservabilityConfig, ParallelConfig, PoolerConfig, - PoolType, PrefixCachingHashAlgo, PromptAdapterConfig, + PrefixCachingHashAlgo, PromptAdapterConfig, SchedulerConfig, SchedulerPolicy, SpeculativeConfig, TaskOption, TokenizerPoolConfig, VllmConfig, get_attr_docs, get_field) @@ -39,9 +38,6 @@ # yapf: enable -if TYPE_CHECKING: - from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup - logger = init_logger(__name__) ALLOWED_DETAILED_TRACE_MODULES = ["model", "worker", "all"] @@ -185,13 +181,12 @@ class EngineArgs: enforce_eager: Optional[bool] = None max_seq_len_to_capture: int = 8192 disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce + # The following three fields are deprecated and will be removed in a future + # release. Setting them will have no effect. Please remove them from your + # configurations. tokenizer_pool_size: int = TokenizerPoolConfig.pool_size - # Note: Specifying a tokenizer pool by passing a class - # is intended for expert use only. The API may change without - # notice. 
- tokenizer_pool_type: Union[PoolType, Type["BaseTokenizerGroup"]] = \ - TokenizerPoolConfig.pool_type - tokenizer_pool_extra_config: dict[str, Any] = \ + tokenizer_pool_type: str = TokenizerPoolConfig.pool_type + tokenizer_pool_extra_config: dict = \ get_field(TokenizerPoolConfig, "extra_config") limit_mm_per_prompt: dict[str, int] = \ get_field(MultiModalConfig, "limit_per_prompt") @@ -1187,11 +1182,6 @@ def create_engine_config( enable_expert_parallel=self.enable_expert_parallel, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, - tokenizer_pool_config=TokenizerPoolConfig.create_config( - self.tokenizer_pool_size, - self.tokenizer_pool_type, - self.tokenizer_pool_extra_config, - ), ray_workers_use_nsight=self.ray_workers_use_nsight, placement_group=placement_group, distributed_executor_backend=self.distributed_executor_backend, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 67c7e109c9f0..ca8fd83314ae 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -526,8 +526,6 @@ async def add_request_async( ) async def check_health_async(self) -> None: - if self.tokenizer: - self.tokenizer.check_health() self.model_executor.check_health() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4644053785f1..276891489836 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -55,7 +55,7 @@ from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import ( - BaseTokenizerGroup, init_tokenizer_from_configs) + TokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) from vllm.utils import (Counter, Device, deprecate_kwargs, @@ -66,7 +66,6 @@ logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 -_G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup) _O = TypeVar("_O", RequestOutput, PoolingRequestOutput) _R = TypeVar("_R", default=Any) @@ -205,7 +204,7 @@ def validate_outputs( return outputs_ - tokenizer: Optional[BaseTokenizerGroup] + tokenizer: Optional[TokenizerGroup] def __init__( self, @@ -321,11 +320,6 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.parallel_config.disable_custom_all_reduce, }) - if self.tokenizer: - # Ping the tokenizer to ensure liveness if it runs in a - # different process. - self.tokenizer.ping() - self.cached_scheduler_outputs = [ SchedulerOutputState() for _ in range(self.parallel_config.pipeline_parallel_size) @@ -537,21 +531,12 @@ def __del__(self): if model_executor := getattr(self, "model_executor", None): model_executor.shutdown() - def get_tokenizer_group( - self, - group_type: Type[_G] = BaseTokenizerGroup, - ) -> _G: - tokenizer_group = self.tokenizer - - if tokenizer_group is None: + def get_tokenizer_group(self) -> TokenizerGroup: + if self.tokenizer is None: raise ValueError("Unable to get tokenizer because " "skip_tokenizer_init is True") - if not isinstance(tokenizer_group, group_type): - raise TypeError("Invalid type of tokenizer group. 
" - f"Expected type: {group_type}, but " - f"found type: {type(tokenizer_group)}") - return tokenizer_group + return self.tokenizer def get_tokenizer( self, @@ -559,11 +544,10 @@ def get_tokenizer( ) -> AnyTokenizer: return self.get_tokenizer_group().get_lora_tokenizer(lora_request) - def _init_tokenizer(self) -> BaseTokenizerGroup: + def _init_tokenizer(self) -> TokenizerGroup: return init_tokenizer_from_configs( model_config=self.model_config, scheduler_config=self.scheduler_config, - parallel_config=self.parallel_config, lora_config=self.lora_config) def _verify_args(self) -> None: @@ -1952,8 +1936,6 @@ def is_sleeping(self) -> bool: return self.model_executor.is_sleeping def check_health(self) -> None: - if self.tokenizer: - self.tokenizer.check_health() self.model_executor.check_health() def is_tracing_enabled(self) -> bool: diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 6e56cbdbbf8c..eb3ae89394ec 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -101,7 +101,6 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig, self.tokenizer = init_tokenizer_from_configs( model_config=self.model_config, scheduler_config=engine_config.scheduler_config, - parallel_config=engine_config.parallel_config, lora_config=engine_config.lora_config) self.input_preprocessor = InputPreprocessor(self.model_config, self.tokenizer) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 57c7ab73de37..38a541a408fa 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -40,7 +40,6 @@ RequestOutputKind, SamplingParams) from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) -from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.usage.usage_lib import UsageContext from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs, is_list_of) @@ -253,10 +252,10 @@ def __init__( self.default_sampling_params: Union[dict[str, Any], None] = None def get_tokenizer(self) -> AnyTokenizer: - return self.llm_engine.get_tokenizer_group(TokenizerGroup).tokenizer + return self.llm_engine.get_tokenizer_group().tokenizer def set_tokenizer(self, tokenizer: AnyTokenizer) -> None: - tokenizer_group = self.llm_engine.get_tokenizer_group(TokenizerGroup) + tokenizer_group = self.llm_engine.get_tokenizer_group() # While CachedTokenizer is dynamic, have no choice but # compare class name. 
Misjudgment will arise from diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 669fb96e6653..a4609290a900 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -13,7 +13,7 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalInputs) from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup +from vllm.transformers_utils.tokenizer_group import TokenizerGroup from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, PromptType, SingletonInputs, SingletonPrompt, token_inputs) @@ -27,7 +27,7 @@ class InputPreprocessor: def __init__( self, model_config: ModelConfig, - tokenizer: Optional[BaseTokenizerGroup], + tokenizer: Optional[TokenizerGroup], mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: super().__init__() @@ -36,7 +36,7 @@ def __init__( self.tokenizer = tokenizer self.mm_registry = mm_registry - def get_tokenizer_group(self) -> BaseTokenizerGroup: + def get_tokenizer_group(self) -> TokenizerGroup: if self.tokenizer is None: raise ValueError("You cannot pass text prompts when " "`skip_tokenizer_init` is True") diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index 9d1d4bb92e4a..991d5631e64e 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -8,13 +8,13 @@ from .detokenizer_utils import (convert_prompt_ids_to_tokens, detokenize_incrementally) from .tokenizer import AnyTokenizer -from .tokenizer_group import BaseTokenizerGroup +from .tokenizer_group import TokenizerGroup class Detokenizer: """Provides methods to decode the output of a model into text.""" - def __init__(self, tokenizer_group: BaseTokenizerGroup): + def __init__(self, tokenizer_group: TokenizerGroup): self.tokenizer_group = tokenizer_group def get_tokenizer_for_seq(self, sequence: Sequence) -> AnyTokenizer: diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py similarity index 84% rename from vllm/transformers_utils/tokenizer_group/tokenizer_group.py rename to vllm/transformers_utils/tokenizer_group.py index b6e9005bcd24..a829985cb459 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group.py @@ -2,7 +2,7 @@ from typing import List, Optional -from vllm.config import TokenizerPoolConfig +from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens, get_lora_tokenizer, @@ -10,10 +10,8 @@ get_tokenizer) from vllm.utils import LRUCache -from .base_tokenizer_group import BaseTokenizerGroup - -class TokenizerGroup(BaseTokenizerGroup): +class TokenizerGroup: """A group of tokenizers that can be used for LoRA adapters.""" def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, @@ -27,15 +25,6 @@ def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, self.lora_tokenizers = LRUCache[int, AnyTokenizer]( capacity=max(max_loras, max_num_seqs) if enable_lora else 0) - @classmethod - def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig], - **init_kwargs) -> "TokenizerGroup": - return cls(**init_kwargs) - - def ping(self) -> bool: - """Check if the tokenizer group is alive.""" - return True - def get_max_input_len(self, lora_request: Optional[LoRARequest] = 
None ) -> Optional[int]: @@ -104,3 +93,18 @@ async def get_lora_tokenizer_async( return tokenizer else: return self.lora_tokenizers[lora_request.lora_int_id] + + +def init_tokenizer_from_configs(model_config: ModelConfig, + scheduler_config: SchedulerConfig, + lora_config: Optional[LoRAConfig]): + return TokenizerGroup( + tokenizer_id=model_config.tokenizer, + enable_lora=bool(lora_config), + max_num_seqs=scheduler_config.max_num_seqs, + max_loras=lora_config.max_loras if lora_config else 0, + max_input_length=None, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.tokenizer_revision, + truncation_side=model_config.truncation_side) diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py deleted file mode 100644 index 9d2209575bd3..000000000000 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ /dev/null @@ -1,56 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional, Type - -from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig, TokenizerPoolConfig) -from vllm.executor.ray_utils import ray - -from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup -from .tokenizer_group import TokenizerGroup - -if ray: - from .ray_tokenizer_group import RayTokenizerGroupPool -else: - RayTokenizerGroupPool = None # type: ignore - - -def init_tokenizer_from_configs(model_config: ModelConfig, - scheduler_config: SchedulerConfig, - parallel_config: ParallelConfig, - lora_config: Optional[LoRAConfig]): - init_kwargs = dict(tokenizer_id=model_config.tokenizer, - enable_lora=bool(lora_config), - max_num_seqs=scheduler_config.max_num_seqs, - max_loras=lora_config.max_loras if lora_config else 0, - max_input_length=None, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.tokenizer_revision, - truncation_side=model_config.truncation_side) - - return get_tokenizer_group(parallel_config.tokenizer_pool_config, - **init_kwargs) - - -def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig], - **init_kwargs) -> BaseTokenizerGroup: - tokenizer_cls: Type[BaseTokenizerGroup] - if tokenizer_pool_config is None: - tokenizer_cls = TokenizerGroup - elif isinstance(tokenizer_pool_config.pool_type, type) and issubclass( - tokenizer_pool_config.pool_type, BaseTokenizerGroup): - tokenizer_cls = tokenizer_pool_config.pool_type - elif tokenizer_pool_config.pool_type == "ray": - if RayTokenizerGroupPool is None: - raise ImportError( - "RayTokenizerGroupPool is not available. 
Please install " - "the ray package to use the Ray tokenizer group pool.") - tokenizer_cls = RayTokenizerGroupPool - else: - raise ValueError( - f"Unknown pool type: {tokenizer_pool_config.pool_type}") - return tokenizer_cls.from_config(tokenizer_pool_config, **init_kwargs) - - -__all__ = ["AnyTokenizer", "get_tokenizer_group", "BaseTokenizerGroup"] diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py deleted file mode 100644 index c5108a7fc6eb..000000000000 --- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -from abc import ABC, abstractmethod -from typing import List, Optional - -from vllm.config import TokenizerPoolConfig -from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer import AnyTokenizer - - -class BaseTokenizerGroup(ABC): - """A group of tokenizers that can be used for LoRA adapters.""" - - @classmethod - @abstractmethod - def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig], - **init_kwargs) -> "BaseTokenizerGroup": - pass - - @abstractmethod - def ping(self) -> bool: - """Check if the tokenizer group is alive.""" - pass - - @abstractmethod - def get_max_input_len( - self, - lora_request: Optional[LoRARequest] = None, - ) -> Optional[int]: - """Get the maximum input length for the LoRA request.""" - pass - - @abstractmethod - def encode(self, - prompt: str, - lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> List[int]: - """Encode a prompt using the tokenizer group.""" - pass - - @abstractmethod - async def encode_async( - self, - prompt: str, - lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> List[int]: - """Encode a prompt using the tokenizer group.""" - pass - - @abstractmethod - def get_lora_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - """Get a tokenizer for a LoRA request.""" - pass - - @abstractmethod - async def get_lora_tokenizer_async( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - """Get a tokenizer for a LoRA request.""" - pass - - def check_health(self): - """Raise exception if the tokenizer group is unhealthy.""" - return diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py deleted file mode 100644 index b048b8094174..000000000000 --- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +++ /dev/null @@ -1,244 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import asyncio -import os -from typing import List, Optional - -try: - from ray.exceptions import ActorDiedError # type: ignore -except ImportError: - # For older versions of Ray - from ray.exceptions import RayActorError as ActorDiedError # type: ignore -from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy - -from vllm.config import TokenizerPoolConfig -from vllm.executor.ray_utils import ray -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer import AnyTokenizer - -from .base_tokenizer_group import BaseTokenizerGroup -from .tokenizer_group import TokenizerGroup - -logger = init_logger(__name__) - - -class RayTokenizerGroupPool(BaseTokenizerGroup): - """A Ray-based pool of TokenizerGroups for async tokenization.""" - - # Class to use for 
workers making up the pool. - _worker_cls = TokenizerGroup - - @classmethod - def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig], - **init_kwargs) -> "RayTokenizerGroupPool": - if not tokenizer_pool_config: - raise ValueError("tokenizer_pool_config must not be None.") - ray_actor_options = (tokenizer_pool_config.extra_config or { - "num_cpus": 0 - }) - ray_actor_options.setdefault( - "scheduling_strategy", - NodeAffinitySchedulingStrategy( - node_id=ray.get_runtime_context().get_node_id(), soft=True)) - - # Carry over the env vars to the actors. - # This is necessary for API keys and such. - ray_actor_options.setdefault("runtime_env", {}) - _carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"]) - - init_kwargs["num_actors"] = tokenizer_pool_config.pool_size - init_kwargs["ray_actor_options"] = ray_actor_options - - return cls(**init_kwargs) - - def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, - max_input_length: Optional[int], num_actors: int, - ray_actor_options: dict, **tokenizer_config): - # Store a local copy of the TokenizerGroup for quick access - # to underlying HF tokenizers. - self._tokenizer_config = { - "tokenizer_id": tokenizer_id, - "enable_lora": enable_lora, - "max_num_seqs": max_num_seqs, - "max_input_length": max_input_length, - **tokenizer_config - } - self._local_tokenizer_group = self._worker_cls( - **self._tokenizer_config, ) - - self._ray_tokenizer_group_cls = ray.remote( - self._worker_cls).options(**ray_actor_options) # type: ignore - self.tokenizer_actors = [self._init_actor() for _ in range(num_actors)] - self._idle_actors: Optional[asyncio.Queue] = None - - # If set, actor is unhealthy. Will reraise on the next - # check_health call. - self._exception: Optional[ActorDiedError] = None - - def _init_actor(self) -> ray.ObjectRef: - return self._ray_tokenizer_group_cls.remote(**self._tokenizer_config) - - @property - def pool_size(self) -> int: - return len(self.tokenizer_actors) - - def ping(self): - return ray.get([ - actor.ping.remote() # type: ignore - for actor in self.tokenizer_actors - ]) - - def _ensure_queue_initialized(self): - if self._idle_actors is None: - self._idle_actors = asyncio.Queue() - for actor in self.tokenizer_actors: - self._idle_actors.put_nowait(actor) - - def _finalize_encode(self, actor: ray.ObjectRef, - original_actor: ray.ObjectRef, actor_is_alive: bool): - assert self._idle_actors is not None - # Cleanup the dead actor. - if not actor_is_alive or original_actor is not actor: - self.tokenizer_actors.remove(original_actor) - if actor_is_alive: - # Put the actor back in the queue. - # This is done in a finally block to ensure that the actor is - # always put back in the queue, even if an exception/cancellation - # is raised. - self._idle_actors.put_nowait(actor) - # Add back the new actor. - if original_actor is not actor: - self.tokenizer_actors.append(actor) - - def encode(self, - prompt: str, - lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> List[int]: - """Encode a prompt using the tokenizer group. - - We pick an idle actor and use it to encode the prompt. - The actor is then put back in the queue for future use. - This is blocking. 
- """ - self.check_health() - self._ensure_queue_initialized() - assert self._idle_actors is not None - - if self._idle_actors.empty(): - raise RuntimeError("No idle actors available.") - actor = self._idle_actors.get_nowait() - actor_is_alive = True - original_actor = actor - try: - ret = ray.get( - actor.encode.remote(prompt=prompt, - lora_request=lora_request, - add_special_tokens=add_special_tokens)) - except ActorDiedError as e: - # If the actor is dead, we first try to reinitialize it. - logger.warning("%s died with ActorDiedError, reinitializing.", - actor, - exc_info=e) - actor = self._init_actor() - try: - ret = ray.get( - actor.encode.remote(prompt=prompt, - lora_request=lora_request, - add_special_tokens=add_special_tokens)) - except ActorDiedError as e: - logger.error( - "%s died for second time in a row, marking " - "RayTokenizerGroupPool as unhealthy.", actor) - actor_is_alive = False - if not self._exception: - self._exception = e - self.check_health() - finally: - self._finalize_encode(actor, original_actor, actor_is_alive) - return ret - - async def encode_async( - self, - prompt: str, - lora_request: Optional[LoRARequest] = None, - add_special_tokens: Optional[bool] = None) -> List[int]: - """Encode a prompt using the tokenizer group. - - We pick an idle actor and use it to encode the prompt. - If there are no idle actors, we wait until one becomes - available. - The actor is then put back in the queue for future use. - This is non-blocking. - """ - self.check_health() - self._ensure_queue_initialized() - assert self._idle_actors is not None - - actor = await self._idle_actors.get() - actor_is_alive = True - original_actor = actor - try: - ret = await actor.encode.remote( - prompt=prompt, - lora_request=lora_request, - add_special_tokens=add_special_tokens) - except ActorDiedError as e: - # If the actor is dead, we first try to reinitialize it. - logger.warning("%s died with ActorDiedError, reinitializing.", - actor, - exc_info=e) - actor = self._init_actor() - try: - ret = await actor.encode.remote( - prompt=prompt, - lora_request=lora_request, - add_special_tokens=add_special_tokens) - except ActorDiedError as e: - logger.error( - "%s died for second time in a row, marking " - "RayTokenizerGroupPool as unhealthy.", actor) - actor_is_alive = False - if not self._exception: - self._exception = e - self.check_health() - finally: - self._finalize_encode(actor, original_actor, actor_is_alive) - return ret - - def get_max_input_len(self, - lora_request: Optional[LoRARequest] = None - ) -> Optional[int]: - """Get the maximum input length for the LoRA request.""" - return self._local_tokenizer_group.get_max_input_len(lora_request) - - def get_lora_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - return self._local_tokenizer_group.get_lora_tokenizer(lora_request) - - async def get_lora_tokenizer_async( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - return await self._local_tokenizer_group.get_lora_tokenizer_async( - lora_request) - - def check_health(self): - if self._exception: - raise RuntimeError( - "TokenizerGroupPool is unhealthy.") from self._exception - - -def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None: - """Copy over all current process environment variables to the runtime_env. - - The variables in runtime_env will take precedence over the current process - environment variables. 
- - runtime_env will be modified in place.""" - env_vars = os.environ.copy() - runtime_env.setdefault("env_vars", {}) - env_vars.update(runtime_env["env_vars"]) - runtime_env["env_vars"] = env_vars diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1149dfa9ce5c..fdda812c2853 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -81,9 +81,7 @@ def __init__( self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, - parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) - self.tokenizer.ping() # Processor (converts Inputs --> EngineCoreRequests). self.processor = Processor( diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 6fa90b269825..af67408097ab 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -20,7 +20,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import ( - BaseTokenizerGroup, init_tokenizer_from_configs) + TokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.utils import Device from vllm.v1.engine.core_client import EngineCoreClient @@ -32,7 +32,6 @@ logger = init_logger(__name__) -_G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup) _R = TypeVar("_R", default=Any) @@ -74,9 +73,7 @@ def __init__( self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, - parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) - self.tokenizer.ping() # Processor (convert Inputs --> EngineCoreRequests) self.processor = Processor(vllm_config=vllm_config, @@ -258,21 +255,12 @@ def wake_up(self, tags: Optional[list[str]] = None): def is_sleeping(self) -> bool: return self.engine_core.is_sleeping() - def get_tokenizer_group( - self, - group_type: type[_G] = BaseTokenizerGroup, - ) -> _G: - tokenizer_group = self.tokenizer - - if tokenizer_group is None: + def get_tokenizer_group(self) -> TokenizerGroup: + if self.tokenizer is None: raise ValueError("Unable to get tokenizer because " "skip_tokenizer_init is True") - if not isinstance(tokenizer_group, group_type): - raise TypeError("Invalid type of tokenizer group. 
" - f"Expected type: {group_type}, but " - f"found type: {type(tokenizer_group)}") - return tokenizer_group + return self.tokenizer def add_lora(self, lora_request: LoRARequest) -> bool: """Load a new LoRA adapter into the engine for future requests.""" diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index d652b17e55b3..9c42d34b8a9a 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -8,7 +8,7 @@ from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup +from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason from vllm.v1.engine.detokenizer import IncrementalDetokenizer from vllm.v1.engine.logprobs import LogprobsProcessor @@ -225,7 +225,7 @@ class OutputProcessor: def __init__( self, - tokenizer: BaseTokenizerGroup, + tokenizer: TokenizerGroup, log_stats: bool, ): self.log_stats = log_stats diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 7e6b7ba47035..af09a67635a2 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -17,7 +17,7 @@ from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup +from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.mm_input_cache import MirroredProcessingCache from vllm.v1.structured_output.backend_guidance import ( @@ -31,7 +31,7 @@ class Processor: def __init__( self, vllm_config: VllmConfig, - tokenizer: BaseTokenizerGroup, + tokenizer: TokenizerGroup, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 0edb15558dce..a59ec5efc53e 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -61,9 +61,7 @@ def __init__(self, vllm_config: VllmConfig): tokenizer_group = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, - parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) # type: ignore[arg-type] - tokenizer_group.ping() self.vllm_config = vllm_config self.vocab_size = vllm_config.model_config.get_vocab_size() diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 1e4470153e30..0b5a1593b3eb 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -35,9 +35,7 @@ def __init__(self, vllm_config: VllmConfig): tokenizer_group = init_tokenizer_from_configs( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, - parallel_config=vllm_config.parallel_config, lora_config=vllm_config.lora_config) # type: ignore[arg-type] - tokenizer_group.ping() self.disable_any_whitespace = False backend_options = GuidedDecodingParams( From a9138e85b14047e06300685b48e3485b995425fb Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 24 Apr 2025 12:44:12 +0100 Subject: [PATCH 42/81] Fix OOT registration 
test (#17099) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/test_oot_registration.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index f1ed8a04cfa0..b45a87d94b86 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -18,10 +18,9 @@ def test_plugin( m.setenv("VLLM_USE_V1", "0") m.setenv("VLLM_PLUGINS", "") - with pytest.raises(Exception) as excinfo: + match = "Cannot find model module" + with pytest.raises(ValueError, match=match): LLM(model=dummy_opt_path, load_format="dummy") - error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM" # noqa: E501 - assert (error_msg in str(excinfo.value)) @create_new_process_for_each_test() From c0dfd97519c8ff18b6f9c7d86010294ca3510edf Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Thu, 24 Apr 2025 05:27:08 -0700 Subject: [PATCH 43/81] [V1][PP] Optimization: continue scheduling prefill chunks (#17080) Signed-off-by: Rui Qiao --- tests/v1/core/test_scheduler.py | 4 -- tests/v1/engine/test_engine_core.py | 106 ++++++++++++++++++++++------ vllm/v1/core/sched/interface.py | 5 -- vllm/v1/core/sched/scheduler.py | 67 +++++++++--------- vllm/v1/engine/core.py | 20 +++--- 5 files changed, 128 insertions(+), 74 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 560a60a81446..003f94259e2d 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -437,7 +437,6 @@ def test_stop_via_update_from_output(): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - scheduler.scheduled_req_ids.add(req.request_id) scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -489,7 +488,6 @@ def test_stop_via_update_from_output(): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - scheduler.scheduled_req_ids.add(req.request_id) scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -539,7 +537,6 @@ def test_stop_via_update_from_output(): req.num_computed_tokens = req.num_tokens scheduler.requests[req.request_id] = req scheduler.running.append(req) - scheduler.scheduled_req_ids.add(req.request_id) scheduler_output = SchedulerOutput(scheduled_new_reqs=[], scheduled_cached_reqs=[], @@ -589,7 +586,6 @@ def test_stop_via_update_from_output(): requests[0].num_computed_tokens = requests[0].num_tokens scheduler.requests[requests[0].request_id] = requests[0] scheduler.running.append(requests[0]) - scheduler.scheduled_req_ids.add(requests[0].request_id) scheduler_output = SchedulerOutput( scheduled_new_reqs=[], diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 3f3109c1484c..047ff774cadc 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import copy -import threading import time import uuid -from concurrent.futures import Future +from concurrent.futures import Future, ThreadPoolExecutor import pytest from transformers import AutoTokenizer @@ -244,33 +243,33 @@ def initialize_from_config( self, kv_cache_configs: list[KVCacheConfig]) -> None: super().initialize_from_config(kv_cache_configs) - # This executor actually can 
only run 1 batch at a time - self.semaphore = threading.Semaphore(1) + # Create a thread pool with a single worker + self.thread_pool = ThreadPoolExecutor(max_workers=1) def execute_model( self, scheduler_output, ) -> Future[ModelRunnerOutput]: """Make execute_model non-blocking.""" - future: Future[ModelRunnerOutput] = Future() - def _thread_wrapper(scheduler_output, future): - with self.semaphore: - output = self.collective_rpc("execute_model", - args=(scheduler_output, )) - # Make a copy because output[0] may be reused - # by the next batch. - output = copy.deepcopy(output[0]) - future.set_result(output) + def _execute(): + output = self.collective_rpc("execute_model", + args=(scheduler_output, )) + # Make a copy because output[0] may be reused + # by the next batch. + return copy.deepcopy(output[0]) - threading.Thread(target=_thread_wrapper, - args=(scheduler_output, future)).start() - return future + # Use the thread pool instead of creating a new thread + return self.thread_pool.submit(_execute) @property def max_concurrent_batches(self) -> int: return 2 + def shutdown(self): + if hasattr(self, 'thread_pool'): + self.thread_pool.shutdown(wait=False) + with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") @@ -299,14 +298,77 @@ def max_concurrent_batches(self) -> int: # Schedule Batch 1: (10, req0) assert engine_core.step_with_batch_queue() is None assert engine_core.batch_queue.qsize() == 1 + scheduler_output = engine_core.batch_queue.queue[-1][1] + assert scheduler_output.num_scheduled_tokens[0] == 10 + # num_computed_tokens should have been updated immediately. + assert engine_core.scheduler.requests[ + req0.request_id].num_computed_tokens == 10 + + # Schedule Batch 2: (2, req0), (8, req1) assert engine_core.step_with_batch_queue() is None assert engine_core.batch_queue.qsize() == 2 + scheduler_output = engine_core.batch_queue.queue[-1][1] + assert scheduler_output.num_scheduled_tokens[0] == 2 + assert scheduler_output.num_scheduled_tokens[1] == 8 + # num_computed_tokens should have been updated immediately. + assert engine_core.scheduler.requests[0].num_computed_tokens == 12 + assert engine_core.scheduler.requests[1].num_computed_tokens == 8 + assert engine_core.scheduler.get_num_unfinished_requests() == 2 - # Loop through both requests. - while engine_core.scheduler.get_num_unfinished_requests() == 2: - engine_core.step_with_batch_queue() + # Batch queue is full. Finish Batch 1. + engine_core.step_with_batch_queue() + + # Schedule Batch 3: (4, req1). Note that req0 cannot be scheduled + # because it is in the decoding stage now. + engine_core.step_with_batch_queue() + assert engine_core.batch_queue.qsize() == 2 + scheduler_output = engine_core.batch_queue.queue[-1][1] + assert scheduler_output.num_scheduled_tokens[1] == 4 - # Reaching here when got the result of the first request. - while engine_core.scheduler.get_num_unfinished_requests() == 1: - engine_core.step_with_batch_queue() + # Batch queue is full. Finish Batch 2. Get first token of req0. + output = engine_core.step_with_batch_queue() + assert output is not None + assert len(output.outputs) == 1 + assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13 + + # Schedule Batch 4: (1, req0). + engine_core.step_with_batch_queue() + assert engine_core.batch_queue.qsize() == 2 + scheduler_output = engine_core.batch_queue.queue[-1][1] + assert scheduler_output.num_scheduled_tokens[0] == 1 + + # Batch queue is full. Finish Batch 3. Get first token of req1. 
+ output = engine_core.step_with_batch_queue() + assert output is not None + assert len(output.outputs) == 1 + assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13 + + # Schedule Batch 5: (1, req1). + engine_core.step_with_batch_queue() + assert engine_core.batch_queue.qsize() == 2 + scheduler_output = engine_core.batch_queue.queue[-1][1] + assert scheduler_output.num_scheduled_tokens[1] == 1 + + # Loop until req0 is finished. + step = 0 + req_id = 0 + expected_num_tokens = [ + engine_core.scheduler.requests[0].num_tokens + 1, + engine_core.scheduler.requests[1].num_tokens + 1, + ] + while engine_core.scheduler.get_num_unfinished_requests() == 2: + output = engine_core.step_with_batch_queue() + if step % 2 == 0: + # Even steps consumes an output. + assert output is not None + assert len(output.outputs) == 1 + if req_id in engine_core.scheduler.requests: + assert engine_core.scheduler.requests[ + req_id].num_tokens == expected_num_tokens[req_id] + expected_num_tokens[req_id] += 1 + req_id = (req_id + 1) % 2 + else: + # Odd steps schedules a new batch. + assert output is None + step += 1 diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py index bfed44f9d58c..1de236d42f02 100644 --- a/vllm/v1/core/sched/interface.py +++ b/vllm/v1/core/sched/interface.py @@ -117,11 +117,6 @@ def has_requests(self) -> bool: not yet returned in SchedulerOutputs.""" return self.has_unfinished_requests() or self.has_finished_requests() - @abstractmethod - def get_num_unscheduled_requests(self) -> int: - """Number of requests that are not being processed by the executor.""" - raise NotImplementedError - @abstractmethod def reset_prefix_cache(self) -> bool: """Reset the prefix cache for KV cache. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 5adcdde5bcd7..5805950f39b0 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -3,7 +3,7 @@ from __future__ import annotations import time -from collections import deque +from collections import defaultdict, deque from collections.abc import Iterable from typing import Optional, Union @@ -88,9 +88,6 @@ def __init__( # Priority queues for requests. self.waiting: deque[Request] = deque() self.running: list[Request] = [] - # The requests that have been scheduled and are being executed - # by the executor. - self.scheduled_req_ids: set[str] = set() # The request IDs that are finished in between the previous and the # current steps. This is used to notify the workers about the finished @@ -100,8 +97,9 @@ def __init__( # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating # them at each scheduling step. - # Request id -> CachedRequestData - self._cached_reqs_data: dict[str, CachedRequestData] = {} + # Request id -> deque of CachedRequestData + self._cached_reqs_data: dict[ + str, deque[CachedRequestData]] = defaultdict(deque) # Encoder-related. # Calculate encoder cache size if applicable @@ -171,10 +169,6 @@ def schedule(self) -> SchedulerOutput: req_index = 0 while req_index < len(self.running) and token_budget > 0: request = self.running[req_index] - if request.request_id in self.scheduled_req_ids: - # This request has already been scheduled. 
- req_index += 1 - continue num_new_tokens = (request.num_tokens_with_spec - request.num_computed_tokens) @@ -183,33 +177,35 @@ def schedule(self) -> SchedulerOutput: num_new_tokens = ( self.scheduler_config.long_prefill_token_threshold) num_new_tokens = min(num_new_tokens, token_budget) - assert num_new_tokens > 0 # Make sure the input position does not exceed the max model len. # This is necessary when using spec decoding. num_new_tokens = min( num_new_tokens, self.max_model_len - request.num_computed_tokens) - assert num_new_tokens > 0 # Schedule encoder inputs. + encoder_inputs_to_schedule = None + new_encoder_budget = encoder_budget if request.has_encoder_inputs: (encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget) = self._try_schedule_encoder_inputs( request, request.num_computed_tokens, num_new_tokens, encoder_budget) - if num_new_tokens == 0: - # The request cannot be scheduled because the encoder budget - # or the encoder cache is exhausted. - # NOTE(woosuk): By using `continue` instead of `break` here, - # we intentionally relax the strict FCFS scheduling policy - # to allow lower-priority requests to be scheduled when a - # higher-priority request is blocked by encoder constraints. - req_index += 1 - continue - else: - encoder_inputs_to_schedule = None - new_encoder_budget = encoder_budget + + if num_new_tokens == 0: + # The request cannot be scheduled because one of the following + # reasons: + # 1. No new tokens to schedule. This may happen when PP>1 and + # we have already scheduled all prompt tokens but they are + # not finished yet. + # 2. The encoder budget is exhausted. + # 3. The encoder cache is exhausted. + # NOTE(woosuk): Here, by doing `continue` instead of `break`, + # we do not strictly follow the FCFS scheduling policy and + # allow the lower-priority requests to be scheduled. + req_index += 1 + continue while True: new_blocks = self.kv_cache_manager.allocate_slots( @@ -243,7 +239,6 @@ def schedule(self) -> SchedulerOutput: # Schedule the request. scheduled_running_reqs.append(request) - self.scheduled_req_ids.add(request.request_id) if request.use_structured_output: # PERF: in case of chunked prefill, # request might not include any new tokens. @@ -382,7 +377,6 @@ def schedule(self) -> SchedulerOutput: request.request_id] = req_index req_index += 1 self.running.append(request) - self.scheduled_req_ids.add(request.request_id) if self.log_stats: request.record_event(EngineCoreEventType.SCHEDULED, scheduled_timestamp) @@ -521,18 +515,21 @@ def _make_cached_request_data( num_regular_tokens = num_scheduled_tokens - num_scheduled_spec_tokens new_token_ids = request.all_token_ids[ num_computed_tokens:num_computed_tokens + num_regular_tokens] - req_data = self._cached_reqs_data.get(request.request_id) - if req_data is not None: + + req_data_queue = self._cached_reqs_data.get(request.request_id) + if req_data_queue: + req_data = req_data_queue.popleft() req_data.resumed_from_preemption = resumed_from_preemption req_data.new_token_ids = new_token_ids req_data.new_block_ids = new_block_ids req_data.num_computed_tokens = num_computed_tokens else: + # No cached request data, or all cached request data has been + # used by the scheduled requests. 
req_data = CachedRequestData.from_request(request, resumed_from_preemption, new_token_ids, new_block_ids) - self._cached_reqs_data[request.request_id] = req_data return req_data def _try_schedule_encoder_inputs( @@ -561,6 +558,8 @@ def _try_schedule_encoder_inputs( Note that num_computed_tokens includes both locally cached blocks and externally cached blocks (via KVConnector). """ + if num_new_tokens == 0 or not request.has_encoder_inputs: + return [], num_new_tokens, encoder_budget encoder_inputs_to_schedule: list[int] = [] mm_positions = request.mm_positions assert mm_positions is not None @@ -728,10 +727,13 @@ def update_from_output( # Invariant: EngineCore returns no partial prefill outputs. assert not prompt_logprobs_tensors - self.scheduled_req_ids.remove(req_id) if not stopped: new_running.append(request) + # Return the cached request data to the queue so they can be reused. + for req_data in scheduler_output.scheduled_cached_reqs: + self._cached_reqs_data[req_data.req_id].append(req_data) + self.running = new_running engine_core_outputs = EngineCoreOutputs( outputs=outputs, @@ -774,7 +776,6 @@ def finish_requests( if request.status == RequestStatus.RUNNING: self.running.remove(request) - self.scheduled_req_ids.discard(request.request_id) else: self.waiting.remove(request) request.status = finished_status @@ -795,10 +796,6 @@ def get_num_unfinished_requests(self) -> int: def has_finished_requests(self) -> bool: return len(self.finished_req_ids) > 0 - def get_num_unscheduled_requests(self) -> int: - """Number of requests that are not being processed by the executor.""" - return self.get_num_unfinished_requests() - len(self.scheduled_req_ids) - def reset_prefix_cache(self) -> bool: return self.kv_cache_manager.reset_prefix_cache() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 572e052cdcc2..65b97eb4a72e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -210,10 +210,10 @@ def step_with_batch_queue(self) -> Optional[EngineCoreOutputs]: Note that if nothing to output in this step, None is returned. The execution flow is as follows: - 1. Try to schedule a new batch if there are unscheduled requests - and the job queue is not full. If a new batch is scheduled, directly - return an empty engine core output. In other words, we won't check - and return model outputs before the batch queue is full. + 1. Try to schedule a new batch if the batch queue is not full. + If a new batch is scheduled, directly return an empty engine core + output. In other words, fulfilling the batch queue has a higher priority + than getting model outputs. 2. If there is no new scheduled batch, meaning that the batch queue is full or no other requests can be scheduled, we block until the first batch in the job queue is finished. @@ -223,10 +223,10 @@ def step_with_batch_queue(self) -> Optional[EngineCoreOutputs]: engine_core_outputs = None scheduler_output = None - # If there are unscheduled requests and the job queue - # is not full, schedule a new batch. Note that this is not blocking. - if (self.scheduler.get_num_unscheduled_requests() > 0 - and not self.batch_queue.full()): + # Try to schedule a new batch if the batch queue is not full, but + # the scheduler may return an empty batch if all requests are scheduled. + # Note that this is not blocking. 
+ if not self.batch_queue.full(): scheduler_output = self.scheduler.schedule() if scheduler_output.total_num_scheduled_tokens > 0: future = self.model_executor.execute_model(scheduler_output) @@ -238,6 +238,10 @@ def step_with_batch_queue(self) -> Optional[EngineCoreOutputs]: # If no more requests can be scheduled and the job queue is not empty, # block until the first batch in the job queue is finished. + # TODO(comaniac): Ideally we should peek the first batch in the + # job queue to check if it's finished before scheduling a new batch, + # but peeking the first element in a queue is not thread-safe, + # so we need more work. if not scheduled_batch and not self.batch_queue.empty(): future, scheduler_output = self.batch_queue.get_nowait() # Blocking until the first result is available. From b0c1f6202d2558c33bdcf252b0e7d47f9b5d19e3 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 24 Apr 2025 21:14:32 +0800 Subject: [PATCH 44/81] [Misc] Remove OLMo2 config copy (#17066) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/olmo2.py | 2 +- vllm/transformers_utils/config.py | 8 +- vllm/transformers_utils/configs/__init__.py | 2 - vllm/transformers_utils/configs/olmo2.py | 168 -------------------- 4 files changed, 4 insertions(+), 176 deletions(-) delete mode 100644 vllm/transformers_utils/configs/olmo2.py diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index e7a5f91a3dae..44beae5726dc 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -28,6 +28,7 @@ import torch from torch import nn +from transformers import Olmo2Config from vllm.attention import Attention from vllm.config import VllmConfig @@ -51,7 +52,6 @@ make_layers, maybe_prefix) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.olmo2 import Olmo2Config class Olmo2Attention(nn.Module): diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 4e2a31ce6729..c9f9af45044e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -36,10 +36,9 @@ KimiVLConfig, MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, NemotronConfig, - NVLM_D_Config, Olmo2Config, - RWConfig, SkyworkR1VChatConfig, - SolarConfig, Telechat2Config, - UltravoxConfig) + NVLM_D_Config, RWConfig, + SkyworkR1VChatConfig, SolarConfig, + Telechat2Config, UltravoxConfig) # yapf: enable from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import resolve_obj_by_qualname @@ -76,7 +75,6 @@ "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, - "olmo2": Olmo2Config, "solar": SolarConfig, "skywork_chat": SkyworkR1VChatConfig, "telechat": Telechat2Config, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 739eea5cba51..8812d4c484b1 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -21,7 +21,6 @@ from vllm.transformers_utils.configs.mpt import MPTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config -from vllm.transformers_utils.configs.olmo2 import Olmo2Config from vllm.transformers_utils.configs.skyworkr1v import SkyworkR1VChatConfig from vllm.transformers_utils.configs.solar import SolarConfig from vllm.transformers_utils.configs.telechat2 import 
Telechat2Config @@ -46,7 +45,6 @@ "KimiVLConfig", "NemotronConfig", "NVLM_D_Config", - "Olmo2Config", "SkyworkR1VChatConfig", "SolarConfig", "Telechat2Config", diff --git a/vllm/transformers_utils/configs/olmo2.py b/vllm/transformers_utils/configs/olmo2.py deleted file mode 100644 index c6e446333b43..000000000000 --- a/vllm/transformers_utils/configs/olmo2.py +++ /dev/null @@ -1,168 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# yapf: disable -# ruff: noqa: E501 -# coding=utf-8 -# Copied from -# https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/configuration_olmo2.py -"""OLMo 2 configuration.""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - - -class Olmo2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Olmo2Model`]. It is used to instantiate an OLMo2 - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf). - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 50304): - Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Olmo2Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*, defaults to 1): - Padding token id. - bos_token_id (`int`, *optional*): - Beginning of stream token id. 
- eos_token_id (`int`, *optional*, defaults to 50279): - End of stream token id. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. - attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - rms_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the rms normalization layers. - - ```python - >>> from transformers import Olmo2Model, Olmo2Config - - >>> # Initializing a Olmo2 7B style configuration - >>> configuration = Olmo2Config() - - >>> # Initializing a model from the Olmo2 7B style configuration - >>> model = Olmo2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ``` - """ - - model_type = "olmo2" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=50304, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - use_cache=True, - pad_token_id=1, - bos_token_id=None, - eos_token_id=50279, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - rms_norm_eps=1e-5, - **kwargs, - ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - - self.rms_norm_eps = rms_norm_eps - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. 
- """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") From 21f4f1c9a4df20ae4e744ad46a426aaf981b9412 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 24 Apr 2025 14:14:47 +0100 Subject: [PATCH 45/81] Improve static type checking in `LoRAModelRunnerMixin` (#17104) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/v1/worker/lora_model_runner_mixin.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index a8a19e0e6206..3cbab840e969 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -28,20 +28,16 @@ def load_lora_model(self, model: nn.Module, model_config: ModelConfig, scheduler_config: SchedulerConfig, lora_config: LoRAConfig, device: str) -> nn.Module: - assert supports_lora( - model), f"{model.__class__.__name__} does not support LoRA yet." + if not supports_lora(model): + raise ValueError( + f"{model.__class__.__name__} does not support LoRA yet.") if supports_multimodal(model): logger.warning("Regarding multimodal models, vLLM currently " "only supports adding LoRA to language model.") - # It's necessary to distinguish between the max_position_embeddings - # of VLMs and LLMs. 
- if hasattr(model.config, "max_position_embeddings"): - max_pos_embeddings = model.config.max_position_embeddings - else: - max_pos_embeddings = ( - model.config.text_config.max_position_embeddings) + # Use get_text_config() in case of multimodal models + text_config = model_config.hf_config.get_text_config() # Add LoRA Manager to the Model Runner self.lora_manager = LRUCacheWorkerLoRAManager( @@ -52,7 +48,7 @@ def load_lora_model(self, model: nn.Module, model_config: ModelConfig, device, model.embedding_modules, model.embedding_padding_modules, - max_position_embeddings=max_pos_embeddings, + max_position_embeddings=text_config.max_position_embeddings, ) return self.lora_manager.create_lora_manager(model) From b724afe343b788d61ca0425892f2631669f97f45 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Thu, 24 Apr 2025 21:15:03 +0800 Subject: [PATCH 46/81] [V1][Structured Output] Clear xgrammar compiler object when engine core shut down to avoid nanobind leaked warning (#16954) Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm/v1/engine/core.py | 1 + vllm/v1/structured_output/__init__.py | 4 ++++ vllm/v1/structured_output/backend_guidance.py | 3 +++ vllm/v1/structured_output/backend_types.py | 6 ++++++ vllm/v1/structured_output/backend_xgrammar.py | 3 +++ 5 files changed, 17 insertions(+) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 65b97eb4a72e..9590a9aadbec 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -253,6 +253,7 @@ def step_with_batch_queue(self) -> Optional[EngineCoreOutputs]: return engine_core_outputs def shutdown(self): + self.structured_output_manager.clear_backend() if self.model_executor: self.model_executor.shutdown() diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 218af43deb67..0fd66c072960 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -107,3 +107,7 @@ def grammar_bitmask( # np.ndarray, because that is much more efficient for serialization # and deserialization when sending this to the GPU workers. return bitmask_tensor.numpy() + + def clear_backend(self) -> None: + if self.backend is not None: + self.backend.destroy() diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index a59ec5efc53e..6d2ccd4019d4 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -108,6 +108,9 @@ def allocate_token_bitmask(self, max_num_seqs: int): return llguidance_torch.allocate_token_bitmask( max_num_seqs, self.ll_tokenizer.vocab_size) + def destroy(self): + pass + @dataclass class GuidanceGrammar(StructuredOutputGrammar): diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py index 6dc2a92411de..306e4aa0196c 100644 --- a/vllm/v1/structured_output/backend_types.py +++ b/vllm/v1/structured_output/backend_types.py @@ -87,3 +87,9 @@ def allocate_token_bitmask(self, max_num_seqs: int): max_num_seqs (int): The maximum number of sequences for which to allocate the bitmask. """ + + @abstractmethod + def destroy(self): + """ + Backend-specific cleanup. 
+ """ diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 0b5a1593b3eb..d978ae2da85c 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -124,6 +124,9 @@ def compile_grammar(self, request_type: StructuredOutputOptions, def allocate_token_bitmask(self, max_num_seqs: int): return xgr.allocate_token_bitmask(max_num_seqs, self.vocab_size) + def destroy(self): + del self.compiler + @dataclass class XgrammarGrammar(StructuredOutputGrammar): From 67309a1cb55f865f745b36501707f9f73bb023cc Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Thu, 24 Apr 2025 22:06:28 +0800 Subject: [PATCH 47/81] [Frontend] Using matryoshka_dimensions control the allowed output dimensions. (#16970) --- docs/source/models/pooling_models.md | 10 +- .../openai_embedding_matryoshka_fy.py | 4 +- tests/entrypoints/openai/test_embedding.py | 42 +++--- .../openai/test_embedding_dimensions.py | 134 ++++++++++++------ tests/models/embedding/language/test_jina.py | 32 +++-- tests/models/embedding/utils.py | 21 ++- vllm/config.py | 4 + vllm/pooling_params.py | 11 +- 8 files changed, 177 insertions(+), 81 deletions(-) diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 5f1c2b5b4a3b..7daa0ec1de4d 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -159,14 +159,14 @@ For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model ### Manually enable Matryoshka Embeddings -There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, we simply check the existence of the fields `is_matryoshka` or `matryoshka_dimensions` inside `config.json`. +There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, if `is_matryoshka` is `True` in `config.json,` it is allowed to change the output to arbitrary dimensions. Using `matryoshka_dimensions` can control the allowed output dimensions. -For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}` (offline) or `--hf_overrides '{"is_matryoshka": true}'` (online). +For models that support Matryoshka Embeddings but not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}`, `hf_overrides={"matryoshka_dimensions": []}` (offline) or `--hf_overrides '{"is_matryoshka": true}'`, `--hf_overrides '{"matryoshka_dimensions": []}'`(online). Here is an example to serve a model with Matryoshka Embeddings enabled. 
```text -vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"is_matryoshka":true}' +vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_dimensions":[256]}' ``` ### Offline Inference @@ -204,14 +204,14 @@ curl http://127.0.0.1:8000/v1/embeddings \ "input": "Follow the white rabbit.", "model": "jinaai/jina-embeddings-v3", "encoding_format": "float", - "dimensions": 1 + "dimensions": 32 }' ``` Expected output: ```json -{"id":"embd-0aab28c384d348c3b8f0eb783109dc5f","object":"list","created":1744195454,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-1.0]}],"usage":{"prompt_tokens":10,"total_tokens":10,"completion_tokens":0,"prompt_tokens_details":null}} +{"id":"embd-5c21fc9a5c9d4384a1b021daccaf9f64","object":"list","created":1745476417,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-0.3828125,-0.1357421875,0.03759765625,0.125,0.21875,0.09521484375,-0.003662109375,0.1591796875,-0.130859375,-0.0869140625,-0.1982421875,0.1689453125,-0.220703125,0.1728515625,-0.2275390625,-0.0712890625,-0.162109375,-0.283203125,-0.055419921875,-0.0693359375,0.031982421875,-0.04052734375,-0.2734375,0.1826171875,-0.091796875,0.220703125,0.37890625,-0.0888671875,-0.12890625,-0.021484375,-0.0091552734375,0.23046875]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0,"prompt_tokens_details":null}} ``` A openai client example can be found here: diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/openai_embedding_matryoshka_fy.py index 27ab8cb64037..4544dcfb5ab0 100644 --- a/examples/online_serving/openai_embedding_matryoshka_fy.py +++ b/examples/online_serving/openai_embedding_matryoshka_fy.py @@ -25,11 +25,11 @@ def main(): responses = client.embeddings.create( input=["Follow the white rabbit."], model=model, - dimensions=1, + dimensions=32, ) for data in responses.data: - print(data.embedding) # List of float of len 1 + print(data.embedding) # List of float of len 32 if __name__ == "__main__": diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 2cdeb684f75d..50b20e78c4c4 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,11 +11,12 @@ from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer -from ...models.embedding.utils import check_embeddings_close +from ...models.embedding.utils import correctness_test from ...utils import RemoteOpenAIServer MODEL_NAME = "intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 +DTYPE = "bfloat16" @pytest.fixture(scope="module") @@ -25,7 +26,7 @@ def server(): "embed", # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", + DTYPE, "--enforce-eager", "--max-model-len", "512", @@ -43,9 +44,17 @@ async def client(server): yield async_client +@pytest.fixture(scope="module") +def hf_model(hf_runner): + with hf_runner(MODEL_NAME, dtype=DTYPE, + is_sentence_transformer=True) as hf_model: + yield hf_model + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): +async def test_single_embedding(hf_model, client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "The chef 
prepared a delicious meal.", ] @@ -66,6 +75,9 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): assert embeddings.usage.prompt_tokens == 11 assert embeddings.usage.total_tokens == 11 + vllm_outputs = [d.embedding for d in embeddings.data] + correctness_test(hf_model, input_texts, vllm_outputs) + # test using token IDs input_tokens = [1, 1, 1, 1, 1] embedding_response = await client.embeddings.create( @@ -86,7 +98,8 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): +async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI, + model_name: str): # test list[str] input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", @@ -107,6 +120,9 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): assert embeddings.usage.prompt_tokens == 33 assert embeddings.usage.total_tokens == 33 + vllm_outputs = [d.embedding for d in embeddings.data] + correctness_test(hf_model, input_texts, vllm_outputs) + # test list[list[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] @@ -181,7 +197,7 @@ async def test_conversation_embedding(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_batch_base64_embedding(client: openai.AsyncOpenAI, +async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI, model_name: str): input_texts = [ "Hello my name is", @@ -192,6 +208,7 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI, model=model_name, encoding_format="float") float_data = [d.embedding for d in responses_float.data] + correctness_test(hf_model, input_texts, float_data) responses_base64 = await client.embeddings.create(input=input_texts, model=model_name, @@ -202,24 +219,13 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI, np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()) - check_embeddings_close( - embeddings_0_lst=float_data, - embeddings_1_lst=base64_data, - name_0="float", - name_1="base64", - ) + correctness_test(hf_model, input_texts, base64_data) # Default response is float32 decoded from base64 by OpenAI Client responses_default = await client.embeddings.create(input=input_texts, model=model_name) default_data = [d.embedding for d in responses_default.data] - - check_embeddings_close( - embeddings_0_lst=float_data, - embeddings_1_lst=default_data, - name_0="float", - name_1="default", - ) + correctness_test(hf_model, input_texts, default_data) @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py index 43d109f74f5d..9f5a8c6839bc 100644 --- a/tests/entrypoints/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/openai/test_embedding_dimensions.py @@ -3,73 +3,121 @@ Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`. 
""" +from typing import Optional + import openai import pytest from vllm.entrypoints.openai.protocol import EmbeddingResponse -from ...models.embedding.utils import EmbedModelInfo +from ...conftest import HfRunner +from ...models.embedding.utils import EmbedModelInfo, correctness_test from ...utils import RemoteOpenAIServer MODELS = [ - EmbedModelInfo(name="BAAI/bge-m3", is_matryoshka=False), - EmbedModelInfo(name="jinaai/jina-embeddings-v3", is_matryoshka=True), + EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), + EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5", + is_matryoshka=True, + matryoshka_dimensions=[256]), ] input_texts = [ "The chef prepared a delicious meal.", -] * 3 +] -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -async def test_validating_dimensions(model: EmbedModelInfo): +@pytest.fixture(scope="module", params=MODELS) +def model_info(request): + return request.param + + +@pytest.fixture(scope="module", params=["bfloat16"]) +def dtype(request): + return request.param + + +@pytest.fixture(scope="module") +def server(model_info, dtype: str): args = [ "--task", "embed", # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", + dtype, "--enforce-eager", "--max-model-len", - "512", - "--trust_remote_code" + "512" ] - with RemoteOpenAIServer(model.name, args) as remote_server: - client = remote_server.get_async_client() - - async def make_request(dimensions): - embedding_response = await client.embeddings.create( - model=model.name, - input=input_texts, - dimensions=dimensions, - encoding_format="float", - ) - embeddings = EmbeddingResponse.model_validate( - embedding_response.model_dump(mode="json")) - - assert embeddings.id is not None - assert len(embeddings.data) == 3 - assert len(embeddings.data[0].embedding) > 0 - assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens > 0 - assert embeddings.usage.total_tokens > 0 - - if dimensions is not None: - assert len(embeddings.data[0].embedding) == dimensions - - if model.is_matryoshka: - for dimensions in [None, 16]: - await make_request(dimensions) + if model_info.name == "Snowflake/snowflake-arctic-embed-m-v1.5": + # Manually enable Matryoshka Embeddings + args.extend([ + "--trust_remote_code", "--hf_overrides", + '{"matryoshka_dimensions":[256]}' + ]) + + with RemoteOpenAIServer(model_info.name, args) as remote_server: + yield remote_server + + +@pytest.fixture(scope="module") +def hf_model(hf_runner, model_info, dtype: str): + with hf_runner(model_info.name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + yield hf_model + + +@pytest.mark.asyncio +async def test_matryoshka(model_info: EmbedModelInfo, + server: RemoteOpenAIServer, hf_model: HfRunner): + client = server.get_async_client() + + async def make_request_and_correctness_test(dimensions): + prompts = input_texts * 3 + + embedding_response = await client.embeddings.create( + model=model_info.name, + input=prompts, + dimensions=dimensions, + encoding_format="float", + ) + embeddings = EmbeddingResponse.model_validate( + embedding_response.model_dump(mode="json")) + + assert embeddings.id is not None + assert len(embeddings.data) == 3 + assert len(embeddings.data[0].embedding) > 0 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens > 0 + assert embeddings.usage.total_tokens > 0 + + if dimensions is not None: + assert len(embeddings.data[0].embedding) == dimensions + + vllm_outputs = [d.embedding for d in 
embeddings.data] + correctness_test(hf_model, prompts, vllm_outputs, dimensions) + + if model_info.is_matryoshka: + valid_dimensions: list[Optional[int]] = [None] + if model_info.matryoshka_dimensions is not None: + valid_dimensions += model_info.matryoshka_dimensions[:2] + + for dimensions in valid_dimensions: + await make_request_and_correctness_test(dimensions) + + invalid_dimensions: list[Optional[int]] = [-1] + if model_info.matryoshka_dimensions is not None: + assert 5 not in model_info.matryoshka_dimensions + invalid_dimensions.append(5) + + for dimensions in invalid_dimensions: with pytest.raises(openai.BadRequestError): - for dimensions in [-1]: - await make_request(dimensions) + await make_request_and_correctness_test(dimensions) - else: - for dimensions in [None]: - await make_request(dimensions) + else: + for dimensions in [None]: + await make_request_and_correctness_test(dimensions) + for dimensions in [-1, 16]: with pytest.raises(openai.BadRequestError): - for dimensions in [-1, 16]: - await make_request(dimensions) + await make_request_and_correctness_test(dimensions) diff --git a/tests/models/embedding/language/test_jina.py b/tests/models/embedding/language/test_jina.py index 881d0a75b158..1e234368f3b3 100644 --- a/tests/models/embedding/language/test_jina.py +++ b/tests/models/embedding/language/test_jina.py @@ -153,14 +153,24 @@ def test_matryoshka( with vllm_runner(model, task="embed", dtype=dtype, max_model_len=None) as vllm_model: - vllm_outputs = vllm_model.encode( - example_prompts, - pooling_params=PoolingParams(dimensions=dimensions)) - - check_embeddings_close( - embeddings_0_lst=hf_outputs, - embeddings_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - tol=1e-2, - ) + matryoshka_dimensions = ( + vllm_model.model.llm_engine.model_config.matryoshka_dimensions) + assert matryoshka_dimensions is not None + + if dimensions not in matryoshka_dimensions: + with pytest.raises(ValueError): + vllm_model.encode( + example_prompts, + pooling_params=PoolingParams(dimensions=dimensions)) + else: + vllm_outputs = vllm_model.encode( + example_prompts, + pooling_params=PoolingParams(dimensions=dimensions)) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index a58116e2bf0d..6d4df2c265c4 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Sequence -from typing import NamedTuple +from typing import NamedTuple, Optional import torch import torch.nn.functional as F @@ -43,5 +43,24 @@ def matryoshka_fy(tensor, dimensions): class EmbedModelInfo(NamedTuple): name: str is_matryoshka: bool + matryoshka_dimensions: Optional[list[int]] = None architecture: str = "" enable_test: bool = True + + +def correctness_test(hf_model, + inputs, + vllm_outputs: Sequence[list[float]], + dimensions: Optional[int] = None): + + hf_outputs = hf_model.encode(inputs) + if dimensions: + hf_outputs = matryoshka_fy(hf_outputs, dimensions) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) diff --git a/vllm/config.py b/vllm/config.py index 12e6747c79ff..3e5a17802f0f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1248,6 +1248,10 @@ def is_matryoshka(self) -> bool: return (hasattr(self.hf_config, "matryoshka_dimensions") or 
getattr(self.hf_config, "is_matryoshka", False)) + @property + def matryoshka_dimensions(self): + return getattr(self.hf_config, "matryoshka_dimensions", None) + BlockSize = Literal[1, 8, 16, 32, 64, 128] CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2"] diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index f71daf0c1955..9a3b254f9b68 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -35,7 +35,16 @@ def verify(self, model_config: "ModelConfig") -> None: f'Model "{model_config.served_model_name}" does not ' f'support matryoshka representation, ' f'changing output dimensions will lead to poor results.') - if self.dimensions < 1: + + mds = model_config.matryoshka_dimensions + if mds is not None: + if self.dimensions not in mds: + raise ValueError( + f'Model "{model_config.served_model_name}" ' + f'only supports {str(mds)} matryoshka dimensions, ' + f'use other output dimensions will ' + f'lead to poor results.') + elif self.dimensions < 1: raise ValueError("Dimensions must be greater than 0") def __repr__(self) -> str: From 82e43b2d7e00dfffef4101c3ceb9e7893bf13abb Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 24 Apr 2025 08:49:37 -0600 Subject: [PATCH 48/81] Add missing rocm_skinny_gemms kernel test to CI (#17060) Signed-off-by: mgoin --- tests/kernels/quant_utils.py | 60 ++++++++++++++++++ tests/kernels/quantization/test_block_fp8.py | 2 +- tests/kernels/quantization/test_block_int8.py | 2 +- .../test_rocm_skinny_gemms.py | 0 tests/kernels/utils_block.py | 63 ------------------- 5 files changed, 62 insertions(+), 65 deletions(-) rename tests/kernels/{ => quantization}/test_rocm_skinny_gemms.py (100%) delete mode 100644 tests/kernels/utils_block.py diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 498da6001ae9..764924f26783 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -87,3 +87,63 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ ref_out = (as_float32_tensor(x) * ref_iscale).clamp( fp8_traits_min, fp8_traits_max).to(FP8_DTYPE) return ref_out, ref_scale.view((1, )) + + +def native_w8a8_block_matmul(A: torch.Tensor, B: torch.Tensor, + As: torch.Tensor, Bs: torch.Tensor, block_size, + output_dtype): + """This function performs matrix multiplication with block-wise + quantization using native torch. + It is agnostic to the input data type and can be used for both int8 and + fp8 data types. + + It takes two input tensors `A` and `B` (int8) with scales `As` and + `Bs` (float32). + The output is returned in the specified `output_dtype`. 
+ """ + A = A.to(torch.float32) + B = B.to(torch.float32) + assert A.shape[-1] == B.shape[-1] + assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 + assert len(block_size) == 2 + block_n, block_k = block_size[0], block_size[1] + assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1] + assert A.shape[:-1] == As.shape[:-1] + + M = A.numel() // A.shape[-1] + N, K = B.shape + origin_C_shape = A.shape[:-1] + (N, ) + A = A.reshape(M, A.shape[-1]) + As = As.reshape(M, As.shape[-1]) + n_tiles = (N + block_n - 1) // block_n + k_tiles = (K + block_k - 1) // block_k + assert n_tiles == Bs.shape[0] + assert k_tiles == Bs.shape[1] + + C_shape = (M, N) + C = torch.zeros(C_shape, dtype=torch.float32, device=A.device) + + A_tiles = [ + A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) + ] + B_tiles = [[ + B[ + j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), + ] for i in range(k_tiles) + ] for j in range(n_tiles)] + C_tiles = [ + C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) + ] + As_tiles = [As[:, i:i + 1] for i in range(k_tiles)] + + for i in range(k_tiles): + for j in range(n_tiles): + a = A_tiles[i] + b = B_tiles[j][i] + c = C_tiles[j] + s = As_tiles[i] * Bs[j][i] + c[:, :] += torch.matmul(a, b.t()) * s + + C = C.reshape(origin_C_shape).to(output_dtype) + return C diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index da594675e924..c57e39f42506 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -6,7 +6,7 @@ import pytest import torch -from tests.kernels.utils_block import native_w8a8_block_matmul +from tests.kernels.quant_utils import native_w8a8_block_matmul from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe diff --git a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py index 943470ad113d..104f23fd7cd2 100644 --- a/tests/kernels/quantization/test_block_int8.py +++ b/tests/kernels/quantization/test_block_int8.py @@ -6,7 +6,7 @@ import pytest import torch -from tests.kernels.utils_block import native_w8a8_block_matmul +from tests.kernels.quant_utils import native_w8a8_block_matmul from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe diff --git a/tests/kernels/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py similarity index 100% rename from tests/kernels/test_rocm_skinny_gemms.py rename to tests/kernels/quantization/test_rocm_skinny_gemms.py diff --git a/tests/kernels/utils_block.py b/tests/kernels/utils_block.py deleted file mode 100644 index c16cba50967e..000000000000 --- a/tests/kernels/utils_block.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import torch - - -def native_w8a8_block_matmul(A: torch.Tensor, B: torch.Tensor, - As: torch.Tensor, Bs: torch.Tensor, block_size, - output_dtype): - """This function performs matrix multiplication with block-wise - quantization using native torch. - It is agnostic to the input data type and can be used for both int8 and - fp8 data types. - - It takes two input tensors `A` and `B` (int8) with scales `As` and - `Bs` (float32). - The output is returned in the specified `output_dtype`. 
- """ - A = A.to(torch.float32) - B = B.to(torch.float32) - assert A.shape[-1] == B.shape[-1] - assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2 - assert len(block_size) == 2 - block_n, block_k = block_size[0], block_size[1] - assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1] - assert A.shape[:-1] == As.shape[:-1] - - M = A.numel() // A.shape[-1] - N, K = B.shape - origin_C_shape = A.shape[:-1] + (N, ) - A = A.reshape(M, A.shape[-1]) - As = As.reshape(M, As.shape[-1]) - n_tiles = (N + block_n - 1) // block_n - k_tiles = (K + block_k - 1) // block_k - assert n_tiles == Bs.shape[0] - assert k_tiles == Bs.shape[1] - - C_shape = (M, N) - C = torch.zeros(C_shape, dtype=torch.float32, device=A.device) - - A_tiles = [ - A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) - ] - B_tiles = [[ - B[ - j * block_n:min((j + 1) * block_n, N), - i * block_k:min((i + 1) * block_k, K), - ] for i in range(k_tiles) - ] for j in range(n_tiles)] - C_tiles = [ - C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) - ] - As_tiles = [As[:, i:i + 1] for i in range(k_tiles)] - - for i in range(k_tiles): - for j in range(n_tiles): - a = A_tiles[i] - b = B_tiles[j][i] - c = C_tiles[j] - s = As_tiles[i] * Bs[j][i] - c[:, :] += torch.matmul(a, b.t()) * s - - C = C.reshape(origin_C_shape).to(output_dtype) - return C From 1bcbcbf57417e6fa0427bc60f6acd11b86cb857f Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Thu, 24 Apr 2025 22:49:48 +0800 Subject: [PATCH 49/81] [Misc] refactor example series - structured outputs (#17040) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- ...enai_chat_completion_structured_outputs.py | 194 +++++++++++------- ...etion_structured_outputs_with_reasoning.py | 158 ++++++++------ 2 files changed, 209 insertions(+), 143 deletions(-) diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index 1b690d19b4e8..f71162e36efd 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -1,43 +1,49 @@ # SPDX-License-Identifier: Apache-2.0 +""" +To run this example, you need to start the vLLM server: + +```bash +vllm serve Qwen/Qwen2.5-3B-Instruct +``` +""" from enum import Enum from openai import BadRequestError, OpenAI from pydantic import BaseModel -client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="-", -) # Guided decoding by Choice (list of possible options) -completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[{ - "role": "user", - "content": "Classify this sentiment: vLLM is wonderful!" - }], - extra_body={"guided_choice": ["positive", "negative"]}, -) -print(completion.choices[0].message.content) +def guided_choice_completion(client: OpenAI, model: str): + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": "Classify this sentiment: vLLM is wonderful!" + }], + extra_body={"guided_choice": ["positive", "negative"]}, + ) + return completion.choices[0].message.content + # Guided decoding by Regex -prompt = ("Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. 
Example result:" - "alan.turing@enigma.com\n") - -completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={ - "guided_regex": r"\w+@\w+\.com\n", - "stop": ["\n"] - }, -) -print(completion.choices[0].message.content) +def guided_regex_completion(client: OpenAI, model: str): + prompt = ("Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. Example result:" + "alan.turing@enigma.com\n") + + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={ + "guided_regex": r"\w+@\w+\.com\n", + "stop": ["\n"] + }, + ) + return completion.choices[0].message.content # Guided decoding by JSON using Pydantic schema @@ -54,66 +60,100 @@ class CarDescription(BaseModel): car_type: CarType -json_schema = CarDescription.model_json_schema() - -prompt = ("Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's") -completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={"guided_json": json_schema}, -) -print(completion.choices[0].message.content) +def guided_json_completion(client: OpenAI, model: str): + json_schema = CarDescription.model_json_schema() -# Guided decoding by Grammar -simplified_sql_grammar = """ - root ::= select_statement + prompt = ("Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's") + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={"guided_json": json_schema}, + ) + return completion.choices[0].message.content - select_statement ::= "SELECT " column " from " table " where " condition - column ::= "col_1 " | "col_2 " +# Guided decoding by Grammar +def guided_grammar_completion(client: OpenAI, model: str): + simplified_sql_grammar = """ + root ::= select_statement - table ::= "table_1 " | "table_2 " + select_statement ::= "SELECT " column " from " table " where " condition - condition ::= column "= " number + column ::= "col_1 " | "col_2 " - number ::= "1 " | "2 " -""" + table ::= "table_1 " | "table_2 " -prompt = ("Generate an SQL query to show the 'username' and 'email'" - "from the 'users' table.") -completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={"guided_grammar": simplified_sql_grammar}, -) -print(completion.choices[0].message.content) + condition ::= column "= " number -# Extra backend options -prompt = ("Generate an email address for Alan Turing, who works in Enigma." - "End in .com and new line. 
Example result:" - "alan.turing@enigma.com\n") + number ::= "1 " | "2 " + """ -try: - # The no-fallback option forces vLLM to use xgrammar, so when it fails - # you get a 400 with the reason why + prompt = ("Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table.") completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", + model=model, messages=[{ "role": "user", "content": prompt, }], - extra_body={ - "guided_regex": r"\w+@\w+\.com\n", - "stop": ["\n"], - "guided_decoding_backend": "xgrammar:no-fallback" - }, + extra_body={"guided_grammar": simplified_sql_grammar}, + ) + return completion.choices[0].message.content + + +# Extra backend options +def extra_backend_options_completion(client: OpenAI, model: str): + prompt = ("Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. Example result:" + "alan.turing@enigma.com\n") + + try: + # The no-fallback option forces vLLM to use xgrammar, so when it fails + # you get a 400 with the reason why + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={ + "guided_regex": r"\w+@\w+\.com\n", + "stop": ["\n"], + "guided_decoding_backend": "xgrammar:no-fallback" + }, + ) + return completion.choices[0].message.content + except BadRequestError as e: + print("This error is expected:", e) + + +def main(): + client: OpenAI = OpenAI( + base_url="http://localhost:8000/v1", + api_key="-", ) -except BadRequestError as e: - print("This error is expected:", e) + + model = "Qwen/Qwen2.5-3B-Instruct" + + print("Guided Choice Completion:") + print(guided_choice_completion(client, model)) + + print("\nGuided Regex Completion:") + print(guided_regex_completion(client, model)) + + print("\nGuided JSON Completion:") + print(guided_json_completion(client, model)) + + print("\nGuided Grammar Completion:") + print(guided_grammar_completion(client, model)) + + print("\nExtra Backend Options Completion:") + print(extra_backend_options_completion(client, model)) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py index be634401679c..cb7f30d93255 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py @@ -25,29 +25,28 @@ openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" -client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, -) -models = client.models.list() -model = models.data[0].id +def print_completion_details(completion): + print("reasoning_content: ", + completion.choices[0].message.reasoning_content) + print("content: ", completion.choices[0].message.content) + # Guided decoding by Regex -prompt = ("What is the capital of France?") - -completion = client.chat.completions.create( - model=model, - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={ - "guided_regex": "(Paris|London)", - }, -) -print("reasoning_content: ", completion.choices[0].message.reasoning_content) -print("content: ", completion.choices[0].message.content) +def guided_regex_completion(client: OpenAI, model: str): + prompt = ("What is the capital of France?") + + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": prompt, + }], 
+ extra_body={ + "guided_regex": "(Paris|London)", + }, + ) + print_completion_details(completion) class People(BaseModel): @@ -55,19 +54,19 @@ class People(BaseModel): age: int -json_schema = People.model_json_schema() +def guided_json_completion(client: OpenAI, model: str): + json_schema = People.model_json_schema() -prompt = ("Generate a JSON with the name and age of one random person.") -completion = client.chat.completions.create( - model=model, - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={"guided_json": json_schema}, -) -print("reasoning_content: ", completion.choices[0].message.reasoning_content) -print("content: ", completion.choices[0].message.content) + prompt = ("Generate a JSON with the name and age of one random person.") + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={"guided_json": json_schema}, + ) + print_completion_details(completion) # Guided decoding by JSON using Pydantic schema @@ -84,46 +83,73 @@ class CarDescription(BaseModel): car_type: CarType -json_schema = CarDescription.model_json_schema() +def guided_car_json_completion(client: OpenAI, model: str): + json_schema = CarDescription.model_json_schema() + + prompt = ("Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's") + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={"guided_json": json_schema}, + ) + print_completion_details(completion) -prompt = ("Generate a JSON with the brand, model and car_type of" - "the most iconic car from the 90's") -completion = client.chat.completions.create( - model=model, - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={"guided_json": json_schema}, -) -print("reasoning_content: ", completion.choices[0].message.reasoning_content) -print("content: ", completion.choices[0].message.content) # Guided decoding by Grammar -simplified_sql_grammar = """ - root ::= select_statement +def guided_grammar_completion(client: OpenAI, model: str): + simplified_sql_grammar = """ + root ::= select_statement - select_statement ::= "SELECT " column " from " table " where " condition + select_statement ::= "SELECT " column " from " table " where " condition - column ::= "col_1 " | "col_2 " + column ::= "col_1 " | "col_2 " - table ::= "table_1 " | "table_2 " + table ::= "table_1 " | "table_2 " - condition ::= column "= " number + condition ::= column "= " number + + number ::= "1 " | "2 " + """ + + # This may be very slow https://github.com/vllm-project/vllm/issues/12122 + prompt = ("Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table.") + completion = client.chat.completions.create( + model=model, + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={"guided_grammar": simplified_sql_grammar}, + ) + print_completion_details(completion) + + +def main(): + client: OpenAI = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model: str = models.data[0].id + + print("Guided Regex Completion:") + guided_regex_completion(client, model) + + print("\nGuided JSON Completion (People):") + guided_json_completion(client, model) + + print("\nGuided JSON Completion (CarDescription):") + guided_car_json_completion(client, model) + + print("\nGuided Grammar Completion:") + guided_grammar_completion(client, model) - number ::= "1 " | "2 " -""" -# This may 
be very slow https://github.com/vllm-project/vllm/issues/12122 -prompt = ("Generate an SQL query to show the 'username' and 'email'" - "from the 'users' table.") -completion = client.chat.completions.create( - model=model, - messages=[{ - "role": "user", - "content": prompt, - }], - extra_body={"guided_grammar": simplified_sql_grammar}, -) -print("reasoning_content: ", completion.choices[0].message.reasoning_content) -print("content: ", completion.choices[0].message.content) +if __name__ == "__main__": + main() From 340d7b1b217794c1b89e13936c66b920fd8d842d Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Thu, 24 Apr 2025 16:57:40 +0100 Subject: [PATCH 50/81] [V1][Spec Decoding] Add num_drafts and num_accepted_tokens_per_position metrics (#16665) Signed-off-by: Mark McLoughlin --- tests/v1/core/test_scheduler.py | 39 ++++++---- vllm/v1/core/sched/scheduler.py | 16 ++-- vllm/v1/metrics/loggers.py | 35 +++------ vllm/v1/spec_decode/metrics.py | 128 ++++++++++++++++++++++++++++---- 4 files changed, 158 insertions(+), 60 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 003f94259e2d..591284ec8541 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -6,7 +6,7 @@ import torch from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, - SchedulerConfig, VllmConfig) + SchedulerConfig, SpeculativeConfig, VllmConfig) from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.sampling_params import SamplingParams from vllm.v1.core.sched.output import SchedulerOutput @@ -31,6 +31,7 @@ def create_scheduler( num_blocks: int = 10000, block_size: int = 16, max_model_len: Optional[int] = None, + num_speculative_tokens: Optional[int] = None, ) -> Scheduler: '''Create scheduler under test. 
@@ -81,11 +82,17 @@ def create_scheduler( kv_connector_extra_config={"shared_storage_path": "local_storage"}, ) if use_kv_connector else None + speculative_config: Optional[SpeculativeConfig] = None + if num_speculative_tokens is not None: + speculative_config = SpeculativeConfig( + model="ngram", num_speculative_tokens=num_speculative_tokens) + vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, cache_config=cache_config, kv_transfer_config=kv_transfer_config, + speculative_config=speculative_config, ) kv_cache_config = KVCacheConfig( num_blocks=num_blocks, # A large number of blocks to hold all requests @@ -429,7 +436,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): def test_stop_via_update_from_output(): """Test stopping behavior through update_from_output""" - scheduler = create_scheduler() + scheduler = create_scheduler(num_speculative_tokens=1) # Test case 1: Stop on EOS token requests = create_requests(num_requests=2, max_tokens=10) @@ -480,7 +487,7 @@ def test_stop_via_update_from_output(): assert list(requests[1].output_token_ids) == [10, 11] # Test case 2: Stop on custom stop token - scheduler = create_scheduler() + scheduler = create_scheduler(num_speculative_tokens=2) requests = create_requests(num_requests=2, max_tokens=10, stop_token_ids=[42, 43]) @@ -531,7 +538,7 @@ def test_stop_via_update_from_output(): assert list(requests[1].output_token_ids) == [13, 14] # Test case 3: Stop on max tokens - scheduler = create_scheduler() + scheduler = create_scheduler(num_speculative_tokens=2) requests = create_requests(num_requests=2, max_tokens=2) for req in requests: req.num_computed_tokens = req.num_tokens @@ -580,7 +587,7 @@ def test_stop_via_update_from_output(): assert list(requests[1].output_token_ids) == [13] # Test case 4: Ignore EOS flag - scheduler = create_scheduler() + scheduler = create_scheduler(num_speculative_tokens=2) requests = create_requests(num_requests=1, max_tokens=10) requests[0].sampling_params.ignore_eos = True requests[0].num_computed_tokens = requests[0].num_tokens @@ -682,13 +689,14 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool], @pytest.mark.parametrize( "spec_tokens,output_tokens,expected", [ - ([[1, 2, 3]], [[1, 2, 3, 4]], (3, 3)), # perfect match - ([[1, 2, 3]], [[1, 5]], (3, 1)), # early mismatch - ([[1, 2], [3]], [[1, 2, 5], [3, 4]], (3, 3)), # multiple sequences - ([[1]], [[1, 2]], (1, 1)), # single token sequence - ([[]], [[5]], (0, 0)), # empty sequence + ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match + ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch + ([[1, 2], [3]], [[1, 2, 5], [3, 4]], + (2, 3, 3, [2, 1])), # multiple sequences + ([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence + ([[]], [[5]], (0, 0, 0, [0])), # empty sequence ([[1, 2, 3], [4, 5, 6]], [[1, 2, 7], [4, 8]], - (6, 3)), # multiple mismatches + (2, 6, 3, [2, 1, 0])), # multiple mismatches ]) def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): """Test scheduling behavior with speculative decoding. @@ -697,7 +705,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): 1. Speculated tokens get scheduled correctly 2. 
Spec decoding stats properly count number of draft and accepted tokens """ - scheduler = create_scheduler() + num_spec_tokens = max(1, max(len(t) for t in spec_tokens)) + scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens) requests = create_requests(num_requests=len(spec_tokens), num_tokens=1) req_ids = [] req_to_index = {} @@ -770,8 +779,10 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): else: assert scheduler_stats.spec_decoding_stats is not None stats = scheduler_stats.spec_decoding_stats - assert stats.num_draft_tokens == expected[0] - assert stats.num_accepted_tokens == expected[1] + assert stats.num_drafts == expected[0] + assert stats.num_draft_tokens == expected[1] + assert stats.num_accepted_tokens == expected[2] + assert stats.num_accepted_tokens_per_pos == expected[3] def _assert_right_scheduler_output( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 5805950f39b0..44dd9b026c2d 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -122,11 +122,12 @@ def __init__( self.encoder_cache_manager = EncoderCacheManager( cache_size=encoder_cache_size) - self.num_lookahead_tokens = 0 speculative_config = vllm_config.speculative_config - if speculative_config and speculative_config.method == "eagle": - self.num_lookahead_tokens = \ - speculative_config.num_speculative_tokens + self.num_spec_tokens = self.num_lookahead_tokens = 0 + if speculative_config: + self.num_spec_tokens = speculative_config.num_speculative_tokens + if speculative_config.method == "eagle": + self.num_lookahead_tokens = self.num_spec_tokens def schedule(self) -> SchedulerOutput: # NOTE(woosuk) on the scheduling algorithm: @@ -824,7 +825,8 @@ def make_spec_decoding_stats( if not self.log_stats: return None if spec_decoding_stats is None: - spec_decoding_stats = SpecDecodingStats() - spec_decoding_stats.observe(num_draft_tokens=num_draft_tokens, - num_accepted_tokens=num_accepted_tokens) + spec_decoding_stats = SpecDecodingStats.new(self.num_spec_tokens) + spec_decoding_stats.observe_draft( + num_draft_tokens=num_draft_tokens, + num_accepted_tokens=num_accepted_tokens) return spec_decoding_stats diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 4d70f27f8080..547e60467632 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -12,7 +12,7 @@ from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics from vllm.v1.engine import FinishReason from vllm.v1.metrics.stats import IterationStats, SchedulerStats -from vllm.v1.spec_decode.metrics import SpecDecodingMetrics +from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm logger = init_logger(__name__) @@ -39,7 +39,7 @@ def __init__(self, engine_index: int = 0): # Prefix cache metrics. This cannot be reset. # TODO: Make the interval configurable. 
self.prefix_caching_metrics = PrefixCachingMetrics() - self.spec_decoding_metrics = SpecDecodingMetrics() + self.spec_decoding_logging = SpecDecodingLogging() self.last_prompt_throughput: float = 0.0 self.last_generation_throughput: float = 0.0 @@ -70,7 +70,7 @@ def record(self, scheduler_stats: SchedulerStats, self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats) if scheduler_stats.spec_decoding_stats is not None: - self.spec_decoding_metrics.observe( + self.spec_decoding_logging.observe( scheduler_stats.spec_decoding_stats) self.last_scheduler_stats = scheduler_stats @@ -112,7 +112,7 @@ def log(self): ) if scheduler_stats.spec_decoding_stats is not None: - self.spec_decoding_metrics.log(log_fn=log_fn) + self.spec_decoding_logging.log(log_fn=log_fn) class PrometheusStatLogger(StatLoggerBase): @@ -133,6 +133,9 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): max_model_len = vllm_config.model_config.max_model_len + self.spec_decoding_prom = SpecDecodingProm( + vllm_config.speculative_config, labelnames, labelvalues) + # # Scheduler state # @@ -323,24 +326,6 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): self.labelname_running_lora_adapters, ]) - # - # Speculative Decoding metrics - # The acceptance rate can be calculated using a PromQL query: - # - # rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) / - # rate(vllm:spec_decode_num_draft_tokens_total[$interval]) - # - self.counter_spec_decode_num_draft_tokens = \ - prometheus_client.Counter( - name="vllm:spec_decode_num_draft_tokens_total", - documentation="Number of draft tokens.", - labelnames=labelnames).labels(*labelvalues) - self.counter_spec_decode_num_accepted_tokens = \ - prometheus_client.Counter( - name="vllm:spec_decode_num_accepted_tokens_total", - documentation="Number of accepted tokens.", - labelnames=labelnames).labels(*labelvalues) - # # Cache config info metric # @@ -378,10 +363,8 @@ def record(self, scheduler_stats: SchedulerStats, scheduler_stats.prefix_cache_stats.hits) if scheduler_stats.spec_decoding_stats is not None: - self.counter_spec_decode_num_draft_tokens.inc( - scheduler_stats.spec_decoding_stats.num_draft_tokens) - self.counter_spec_decode_num_accepted_tokens.inc( - scheduler_stats.spec_decoding_stats.num_accepted_tokens) + self.spec_decoding_prom.observe( + scheduler_stats.spec_decoding_stats) if iteration_stats is None: return diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py index cc453b74f7eb..33ce98284e20 100644 --- a/vllm/v1/spec_decode/metrics.py +++ b/vllm/v1/spec_decode/metrics.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 -from dataclasses import dataclass +from dataclasses import dataclass, field +from typing import Optional import numpy as np +import prometheus_client +from vllm.config import SpeculativeConfig from vllm.logger import init_logger logger = init_logger(__name__) @@ -11,52 +14,151 @@ @dataclass class SpecDecodingStats: + """Per-step iteration decoding stats from scheduler. + + Each scheduler step, statistics on spec decoding performance are + aggregated across requests by the scheduler and returned to the + frontend in EngineCoreOutputs->SchedulerStats. 
+ """ + + num_spec_tokens: int + num_drafts: int = 0 num_draft_tokens: int = 0 num_accepted_tokens: int = 0 + num_accepted_tokens_per_pos: list[int] = field(default_factory=list) - def take(self): - copied = SpecDecodingStats(self.num_draft_tokens, - self.num_accepted_tokens) - self.reset() - return copied - - def reset(self): - self.num_draft_tokens = 0 - self.num_accepted_tokens = 0 + @classmethod + def new(cls, num_spec_tokens: int) -> "SpecDecodingStats": + return cls(num_spec_tokens=num_spec_tokens, + num_accepted_tokens_per_pos=[0] * num_spec_tokens) - def observe(self, num_draft_tokens: int, num_accepted_tokens: int): + def observe_draft(self, num_draft_tokens: int, num_accepted_tokens: int): + self.num_drafts += 1 self.num_draft_tokens += num_draft_tokens self.num_accepted_tokens += num_accepted_tokens + assert num_accepted_tokens <= self.num_spec_tokens + for i in range(num_accepted_tokens): + self.num_accepted_tokens_per_pos[i] += 1 + +class SpecDecodingLogging: + """Aggregate and log spec decoding metrics. -class SpecDecodingMetrics: + LoggingStatLogger aggregates per-iteration metrics over a set + time interval using observe() and then logs them using log() + before resetting to zero. + """ def __init__(self): self.reset() def reset(self): + self.num_drafts: list[int] = [] self.num_draft_tokens: list[int] = [] self.num_accepted_tokens: list[int] = [] + self.accepted_tokens_per_pos_lists: list[list[int]] = [] def observe(self, spec_decoding_stats: SpecDecodingStats): + self.num_drafts.append(spec_decoding_stats.num_drafts) self.num_draft_tokens.append(spec_decoding_stats.num_draft_tokens) self.num_accepted_tokens.append( spec_decoding_stats.num_accepted_tokens) + self.accepted_tokens_per_pos_lists.append( + spec_decoding_stats.num_accepted_tokens_per_pos) def log(self, log_fn=logger.info): + num_drafts = np.sum(self.num_drafts) num_draft_tokens = np.sum(self.num_draft_tokens) num_accepted_tokens = np.sum(self.num_accepted_tokens) draft_acceptance_rate = (num_accepted_tokens / num_draft_tokens * 100 if num_draft_tokens > 0 else float("nan")) + mean_acceptance_length = (num_accepted_tokens / num_drafts) + + pos_matrix = np.array(self.accepted_tokens_per_pos_lists) + acceptance_rates = np.sum(pos_matrix, axis=0) / num_drafts + rates_str = ", ".join(f"{p:.3f}" for p in acceptance_rates) log_fn( "SpecDecoding metrics: " "Draft acceptance rate: %.1f%%, " + "Mean acceptance length: %.2f, " "Accepted: %d tokens, " - "Drafted: %d tokens", + "Drafted: %d tokens, " + "Per-position acceptance rate: %s", draft_acceptance_rate, + mean_acceptance_length, num_accepted_tokens, num_draft_tokens, + rates_str, ) self.reset() + + +class SpecDecodingProm: + """Record spec decoding metrics in Prometheus. 
+ + The acceptance rate can be calculated using a PromQL query: + + rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) / + rate(vllm:spec_decode_num_draft_tokens_total[$interval]) + + The mean acceptance length can be calculated using: + + rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) / + rate(vllm:spec_decode_num_drafts[$interval]) + + A per-position acceptance rate vector can be computed using + + vllm:spec_decode_num_accepted_tokens_per_pos[$interval] / + vllm:spec_decode_num_drafts[$interval] + """ + + def __init__(self, speculative_config: Optional[SpeculativeConfig], + labelnames: list[str], labelvalues: list[str]): + self.spec_decoding_enabled = speculative_config is not None + if not self.spec_decoding_enabled: + return + + self.counter_spec_decode_num_drafts = \ + prometheus_client.Counter( + name="vllm:spec_decode_num_drafts_total", + documentation="Number of spec decoding drafts.", + labelnames=labelnames).labels(*labelvalues) + self.counter_spec_decode_num_draft_tokens = \ + prometheus_client.Counter( + name="vllm:spec_decode_num_draft_tokens_total", + documentation="Number of draft tokens.", + labelnames=labelnames).labels(*labelvalues) + self.counter_spec_decode_num_accepted_tokens = \ + prometheus_client.Counter( + name="vllm:spec_decode_num_accepted_tokens_total", + documentation="Number of accepted tokens.", + labelnames=labelnames).labels(*labelvalues) + + assert speculative_config is not None + num_spec_tokens = (speculative_config.num_speculative_tokens + if self.spec_decoding_enabled else 0) + pos_labelnames = labelnames + ["position"] + base_counter = prometheus_client.Counter( + name="vllm:spec_decode_num_accepted_tokens_per_pos", + documentation="Accepted tokens per draft position.", + labelnames=pos_labelnames) + self.counter_spec_decode_num_accepted_tokens_per_pos: \ + list[prometheus_client.Counter] = [] + for pos in range(num_spec_tokens): + pos_labelvalues = labelvalues + [str(pos)] + self.counter_spec_decode_num_accepted_tokens_per_pos.append( + base_counter.labels(*pos_labelvalues)) + + def observe(self, spec_decoding_stats: SpecDecodingStats): + if not self.spec_decoding_enabled: + return + self.counter_spec_decode_num_drafts.inc(spec_decoding_stats.num_drafts) + self.counter_spec_decode_num_draft_tokens.inc( + spec_decoding_stats.num_draft_tokens) + self.counter_spec_decode_num_accepted_tokens.inc( + spec_decoding_stats.num_accepted_tokens) + for pos, counter in enumerate( + self.counter_spec_decode_num_accepted_tokens_per_pos): + counter.inc(spec_decoding_stats.num_accepted_tokens_per_pos[pos]) From 4115f19958d8b3628606d78355c277b328f011e1 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 24 Apr 2025 12:22:00 -0400 Subject: [PATCH 51/81] [CI] Add automation for the `tool-calling` github label (#17118) Signed-off-by: Russell Bryant --- .github/mergify.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 2033722b5f33..15fa3660a87d 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -126,6 +126,28 @@ pull_request_rules: remove: - tpu +- name: label-tool-calling + description: Automatically add tool-calling label + conditions: + - or: + - files~=^tests/tool_use/ + - files~=^tests/mistral_tool_use/ + - files~=^tests/entrypoints/openai/tool_parsers/ + - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py + - files~=^vllm/entrypoints/openai/tool_parsers/ + - files=docs/source/features/tool_calling.md + - 
files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md + - files=docs/source/getting_started/examples/chat_with_tools.md + - files~=^examples/tool_chat_* + - files=examples/offline_inference/chat_with_tools.py + - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py + - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py + - files=examples/online_serving/openai_chat_completion_client_with_tools.py + actions: + label: + add: + - tool-calling + - name: ping author on conflicts and add 'needs-rebase' label conditions: - conflict From 5adf6f6b7f07b53d1beff47975528b6e637c0470 Mon Sep 17 00:00:00 2001 From: Aaruni Aggarwal <47731267+AaruniAggarwal@users.noreply.github.com> Date: Thu, 24 Apr 2025 22:36:17 +0530 Subject: [PATCH 52/81] Updating builkite job for IBM Power (#17111) Signed-off-by: Aaruni Aggarwal --- .../scripts/hardware_ci/run-cpu-test-ppc64le.sh | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh index 036cfea9431c..5d863dd82e9b 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -5,7 +5,12 @@ set -ex # Setup cleanup -remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; } +remove_docker_container() { + if [[ -n "$container_id" ]]; then + podman rm -f "$container_id" || true + fi + podman system prune -f +} trap remove_docker_container EXIT remove_docker_container @@ -13,17 +18,17 @@ remove_docker_container podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le . # Run the image -podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc +container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc) function cpu_tests() { # offline inference - podman exec cpu-test-ubi9-ppc bash -c " + podman exec -it "$container_id" bash -c " set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" # Run basic model test - podman exec cpu-test-ubi9-ppc bash -c " + podman exec -it "$container_id" bash -c " set -e pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib pip install sentence-transformers datamodel_code_generator @@ -33,6 +38,8 @@ function cpu_tests() { } # All of CPU tests are expected to be finished less than 40 mins. 
+ +export container_id export -f cpu_tests timeout 40m bash -c cpu_tests From 49f189439d17f7f7614cc1db03f97089d5579d92 Mon Sep 17 00:00:00 2001 From: Atilla <48064466+atilla00@users.noreply.github.com> Date: Thu, 24 Apr 2025 20:07:21 +0300 Subject: [PATCH 53/81] existing torch installation pip command fix for docs (#17059) --- docs/source/getting_started/installation/gpu/cuda.inc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index cd39d6376fe3..46bdb08ebb77 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -153,7 +153,7 @@ git clone https://github.com/vllm-project/vllm.git cd vllm python use_existing_torch.py pip install -r requirements/build.txt -pip install -e . --no-build-isolation +pip install --no-build-isolation -e . ``` ##### Use the local cutlass for compilation From 47bdee409c92c9be60ce1ad14fa6135ec691ea90 Mon Sep 17 00:00:00 2001 From: Eyshika Agarwal Date: Thu, 24 Apr 2025 12:08:37 -0500 Subject: [PATCH 54/81] Molmo Requirements (#17026) Signed-off-by: Eyshika Agarwal Signed-off-by: eyshika --- docs/source/models/supported_models.md | 4 ++++ requirements/molmo.txt | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 requirements/molmo.txt diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 0fad1b1ef276..01427f4f2ba4 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -1111,6 +1111,10 @@ This limitation exists because the model's mixed attention pattern (bidirectiona To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ::: +:::{warning} +For improved output quality of `AllenAI/Molmo-7B-D-0924` (especially in object localization tasks), we recommend using the pinned dependency versions listed in (including `vllm==0.7.0`). These versions match the environment that achieved consistent results on both A10 and L40 GPUs. +::: + :::{note} The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. 
For more details, please see: diff --git a/requirements/molmo.txt b/requirements/molmo.txt new file mode 100644 index 000000000000..8450e29b6e7d --- /dev/null +++ b/requirements/molmo.txt @@ -0,0 +1,20 @@ +# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40) +torch==2.5.1 +torchvision==0.20.1 +transformers==4.48.1 +tokenizers==0.21.0 +tiktoken==0.7.0 +vllm==0.7.0 + +# Optional but recommended for improved performance and stability +triton==3.1.0 +xformers==0.0.28.post3 +uvloop==0.21.0 +protobuf==5.29.3 +openai==1.60.2 +opencv-python-headless==4.11.0.86 +pillow==10.4.0 + +# Installed FlashAttention (for float16 only) +flash-attn>=2.5.6 # Not used in float32, but should be documented + From 0422ce109f3b4d80df073a542bd23d2c4b060873 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 24 Apr 2025 18:28:45 +0100 Subject: [PATCH 55/81] Add `:markdownhelp:` to `EngineArgs` docs so markdown docstrings render properly (#17124) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/source/serving/engine_args.md | 2 ++ requirements/docs.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/docs/source/serving/engine_args.md b/docs/source/serving/engine_args.md index e9943571a40a..97ea01cd3b2e 100644 --- a/docs/source/serving/engine_args.md +++ b/docs/source/serving/engine_args.md @@ -16,6 +16,7 @@ Below, you can find an explanation of every engine argument: :func: _engine_args_parser :prog: vllm serve :nodefaultconst: + :markdownhelp: ``` ## Async Engine Arguments @@ -29,4 +30,5 @@ Additional arguments are available to the asynchronous engine which is used for :func: _async_engine_args_parser :prog: vllm serve :nodefaultconst: + :markdownhelp: ``` diff --git a/requirements/docs.txt b/requirements/docs.txt index 99fb87def6dd..d84fd633ce10 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -7,6 +7,7 @@ sphinx-togglebutton==0.3.2 myst-parser==3.0.1 msgspec cloudpickle +commonmark # Required by sphinx-argparse when using :markdownhelp: # packages to install to build the documentation cachetools From 0fa939e2d154d42e9722e51d0d004eabfc0b5e6f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 24 Apr 2025 18:29:34 +0100 Subject: [PATCH 56/81] Improve configs - `LoRAConfig` + `PromptAdapterConfig` (#16980) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/lora/test_lora_manager.py | 30 +++++-- vllm/config.py | 46 ++++++++-- vllm/engine/arg_utils.py | 145 +++++++++++++++----------------- 3 files changed, 130 insertions(+), 91 deletions(-) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 576d95a47154..52b0834cacb8 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -31,6 +31,8 @@ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] if current_platform.is_cuda_alike() else ["cpu"]) +DEFAULT_DTYPE = torch.get_default_dtype() + @pytest.fixture(scope="function", autouse=True) def use_v0_only(monkeypatch: pytest.MonkeyPatch): @@ -125,8 +127,10 @@ def test_replace_submodules(dist_init, dummy_model): model = dummy_model manager = LoRAModelManager( model, 1, 1, 1, - LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), - torch.device(DEVICES[0])) + LoRAConfig(max_lora_rank=8, + max_cpu_loras=8, + max_loras=8, + lora_dtype=DEFAULT_DTYPE), torch.device(DEVICES[0])) model = manager.model assert isinstance(model.get_submodule("dense1"), 
ColumnParallelLinearWithLoRA) @@ -155,7 +159,8 @@ def test_lora_model_manager(dist_init, dummy_model, device): 2, LoRAConfig(max_lora_rank=8, max_cpu_loras=3, - max_loras=2), + max_loras=2, + lora_dtype=DEFAULT_DTYPE), device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) @@ -221,7 +226,8 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): 2, LoRAConfig(max_lora_rank=8, max_cpu_loras=3, - max_loras=2), + max_loras=2, + lora_dtype=DEFAULT_DTYPE), device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) @@ -316,7 +322,8 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): 2, LoRAConfig(max_lora_rank=8, max_cpu_loras=2, - max_loras=2), + max_loras=2, + lora_dtype=DEFAULT_DTYPE), device=device) assert all(x is None for x in manager.lora_index_to_id) @@ -424,7 +431,10 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): @pytest.mark.parametrize("device", DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): - lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) + lora_config = LoRAConfig(max_lora_rank=8, + max_cpu_loras=4, + max_loras=4, + lora_dtype=DEFAULT_DTYPE) worker_adapter_manager = LRUCacheWorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - lora_config.lora_extra_vocab_size, lora_config, device, @@ -504,7 +514,10 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, sql_lora_files, device): # Should remove every LoRA not specified in the request. - lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) + lora_config = LoRAConfig(max_lora_rank=8, + max_cpu_loras=4, + max_loras=4, + lora_dtype=DEFAULT_DTYPE) worker_adapter_manager = WorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - lora_config.lora_extra_vocab_size, lora_config, device, @@ -600,7 +613,8 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device): 2, LoRAConfig(max_lora_rank=8, max_cpu_loras=2, - max_loras=2), + max_loras=2, + lora_dtype=DEFAULT_DTYPE), device=device) model = manager.model diff --git a/vllm/config.py b/vllm/config.py index 3e5a17802f0f..41a30efea039 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2565,18 +2565,41 @@ def __repr__(self) -> str: return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})" +LoRADType = Literal["auto", "float16", "bfloat16"] + + +@config @dataclass class LoRAConfig: - max_lora_rank: int - max_loras: int + """Configuration for LoRA.""" + + max_lora_rank: int = 16 + """Max LoRA rank.""" + max_loras: int = 1 + """Max number of LoRAs in a single batch.""" fully_sharded_loras: bool = False + """By default, only half of the LoRA computation is sharded with tensor + parallelism. Enabling this will use the fully sharded layers. At high + sequence length, max rank or tensor parallel size, this is likely faster. + """ max_cpu_loras: Optional[int] = None - lora_dtype: Optional[Union[torch.dtype, str]] = None + """Maximum number of LoRAs to store in CPU memory. Must be >= than + `max_loras`.""" + lora_dtype: Union[torch.dtype, LoRADType] = "auto" + """Data type for LoRA. 
If auto, will default to base model dtype.""" lora_extra_vocab_size: int = 256 + """Maximum size of extra vocabulary that can be present in a LoRA adapter + (added to the base model vocabulary).""" # This is a constant. lora_vocab_padding_size: ClassVar[int] = 256 - long_lora_scaling_factors: Optional[tuple[float]] = None + long_lora_scaling_factors: Optional[tuple[float, ...]] = None + """Specify multiple scaling factors (which can be different from base model + scaling factor - see eg. Long LoRA) to allow for multiple LoRA adapters + trained with those scaling factors to be used at the same time. If not + specified, only adapters trained with the base model scaling factor are + allowed.""" bias_enabled: bool = False + """Enable bias for LoRA adapters.""" def compute_hash(self) -> str: """ @@ -2641,12 +2664,19 @@ def verify_lora_support(self): "V1 LoRA does not support long LoRA, please use V0.") +@config @dataclass class PromptAdapterConfig: - max_prompt_adapters: int - max_prompt_adapter_token: int + max_prompt_adapters: int = 1 + """Max number of PromptAdapters in a batch.""" + max_prompt_adapter_token: int = 0 + """Max number of PromptAdapters tokens.""" max_cpu_prompt_adapters: Optional[int] = None - prompt_adapter_dtype: Optional[torch.dtype] = None + """Maximum number of PromptAdapters to store in CPU memory. Must be >= than + `max_prompt_adapters`.""" + prompt_adapter_dtype: Union[torch.dtype, str] = "auto" + """Data type for PromptAdapter. If auto, will default to base model dtype. + """ def compute_hash(self) -> str: """ @@ -2678,7 +2708,7 @@ def __post_init__(self): self.max_cpu_prompt_adapters = self.max_prompt_adapters def verify_with_model_config(self, model_config: ModelConfig): - if self.prompt_adapter_dtype in (None, "auto"): + if self.prompt_adapter_dtype == "auto": self.prompt_adapter_dtype = model_config.dtype elif isinstance(self.prompt_adapter_dtype, str): self.prompt_adapter_dtype = getattr(torch, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6d6b5ac02b14..9cb2aa797be5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -7,7 +7,7 @@ import re import threading from dataclasses import MISSING, dataclass, fields -from typing import (Any, Callable, Dict, List, Literal, Optional, Tuple, Type, +from typing import (Any, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union, cast, get_args, get_origin) import torch @@ -192,18 +192,23 @@ class EngineArgs: get_field(MultiModalConfig, "limit_per_prompt") mm_processor_kwargs: Optional[Dict[str, Any]] = None disable_mm_preprocessor_cache: bool = False + # LoRA fields enable_lora: bool = False - enable_lora_bias: bool = False - max_loras: int = 1 - max_lora_rank: int = 16 + enable_lora_bias: bool = LoRAConfig.bias_enabled + max_loras: int = LoRAConfig.max_loras + max_lora_rank: int = LoRAConfig.max_lora_rank + fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras + max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras + lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype + lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size + long_lora_scaling_factors: Optional[tuple[float, ...]] = \ + LoRAConfig.long_lora_scaling_factors + # PromptAdapter fields enable_prompt_adapter: bool = False - max_prompt_adapters: int = 1 - max_prompt_adapter_token: int = 0 - fully_sharded_loras: bool = False - lora_extra_vocab_size: int = 256 - long_lora_scaling_factors: Optional[Tuple[float]] = None - lora_dtype: Optional[Union[str, torch.dtype]] = 'auto' - 
max_cpu_loras: Optional[int] = None + max_prompt_adapters: int = PromptAdapterConfig.max_prompt_adapters + max_prompt_adapter_token: int = \ + PromptAdapterConfig.max_prompt_adapter_token + device: Device = DeviceConfig.device num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs @@ -338,10 +343,21 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: kwargs[name]["choices"] = choices choice_type = type(choices[0]) assert all(type(c) is choice_type for c in choices), ( - f"All choices must be of the same type. " + "All choices must be of the same type. " f"Got {choices} with types {[type(c) for c in choices]}" ) kwargs[name]["type"] = choice_type + elif can_be_type(field_type, tuple): + if is_type_in_union(field_type, tuple): + field_type = get_type_from_union(field_type, tuple) + dtypes = get_args(field_type) + dtype = dtypes[0] + assert all( + d is dtype for d in dtypes if d is not Ellipsis + ), ("All non-Ellipsis tuple elements must be of the same " + f"type. Got {dtypes}.") + kwargs[name]["type"] = dtype + kwargs[name]["nargs"] = "+" elif can_be_type(field_type, int): kwargs[name]["type"] = optional_int if optional else int elif can_be_type(field_type, float): @@ -685,70 +701,49 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: 'inputs.') # LoRA related configs - parser.add_argument('--enable-lora', - action='store_true', - help='If True, enable handling of LoRA adapters.') - parser.add_argument('--enable-lora-bias', - action='store_true', - help='If True, enable bias for LoRA adapters.') - parser.add_argument('--max-loras', - type=int, - default=EngineArgs.max_loras, - help='Max number of LoRAs in a single batch.') - parser.add_argument('--max-lora-rank', - type=int, - default=EngineArgs.max_lora_rank, - help='Max LoRA rank.') - parser.add_argument( - '--lora-extra-vocab-size', - type=int, - default=EngineArgs.lora_extra_vocab_size, - help=('Maximum size of extra vocabulary that can be ' - 'present in a LoRA adapter (added to the base ' - 'model vocabulary).')) - parser.add_argument( + lora_kwargs = get_kwargs(LoRAConfig) + lora_group = parser.add_argument_group( + title="LoRAConfig", + description=LoRAConfig.__doc__, + ) + lora_group.add_argument( + '--enable-lora', + action=argparse.BooleanOptionalAction, + help='If True, enable handling of LoRA adapters.') + lora_group.add_argument('--enable-lora-bias', + **lora_kwargs["bias_enabled"]) + lora_group.add_argument('--max-loras', **lora_kwargs["max_loras"]) + lora_group.add_argument('--max-lora-rank', + **lora_kwargs["max_lora_rank"]) + lora_group.add_argument('--lora-extra-vocab-size', + **lora_kwargs["lora_extra_vocab_size"]) + lora_group.add_argument( '--lora-dtype', - type=str, - default=EngineArgs.lora_dtype, - choices=['auto', 'float16', 'bfloat16'], - help=('Data type for LoRA. If auto, will default to ' - 'base model dtype.')) - parser.add_argument( - '--long-lora-scaling-factors', - type=optional_str, - default=EngineArgs.long_lora_scaling_factors, - help=('Specify multiple scaling factors (which can ' - 'be different from base model scaling factor ' - '- see eg. Long LoRA) to allow for multiple ' - 'LoRA adapters trained with those scaling ' - 'factors to be used at the same time. If not ' - 'specified, only adapters trained with the ' - 'base model scaling factor are allowed.')) - parser.add_argument( - '--max-cpu-loras', - type=int, - default=EngineArgs.max_cpu_loras, - help=('Maximum number of LoRAs to store in CPU memory. 
' - 'Must be >= than max_loras.')) - parser.add_argument( - '--fully-sharded-loras', - action='store_true', - help=('By default, only half of the LoRA computation is ' - 'sharded with tensor parallelism. ' - 'Enabling this will use the fully sharded layers. ' - 'At high sequence length, max rank or ' - 'tensor parallel size, this is likely faster.')) - parser.add_argument('--enable-prompt-adapter', - action='store_true', - help='If True, enable handling of PromptAdapters.') - parser.add_argument('--max-prompt-adapters', - type=int, - default=EngineArgs.max_prompt_adapters, - help='Max number of PromptAdapters in a batch.') - parser.add_argument('--max-prompt-adapter-token', - type=int, - default=EngineArgs.max_prompt_adapter_token, - help='Max number of PromptAdapters tokens') + **lora_kwargs["lora_dtype"], + ) + lora_group.add_argument('--long-lora-scaling-factors', + **lora_kwargs["long_lora_scaling_factors"]) + lora_group.add_argument('--max-cpu-loras', + **lora_kwargs["max_cpu_loras"]) + lora_group.add_argument('--fully-sharded-loras', + **lora_kwargs["fully_sharded_loras"]) + + # PromptAdapter related configs + prompt_adapter_kwargs = get_kwargs(PromptAdapterConfig) + prompt_adapter_group = parser.add_argument_group( + title="PromptAdapterConfig", + description=PromptAdapterConfig.__doc__, + ) + prompt_adapter_group.add_argument( + '--enable-prompt-adapter', + action=argparse.BooleanOptionalAction, + help='If True, enable handling of PromptAdapters.') + prompt_adapter_group.add_argument( + '--max-prompt-adapters', + **prompt_adapter_kwargs["max_prompt_adapters"]) + prompt_adapter_group.add_argument( + '--max-prompt-adapter-token', + **prompt_adapter_kwargs["max_prompt_adapter_token"]) # Device arguments device_kwargs = get_kwargs(DeviceConfig) From 6d0df0ebebd4e347e1ebcdea4be010a4b54b901b Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 24 Apr 2025 13:39:43 -0400 Subject: [PATCH 57/81] [Docs] Generate correct github links for decorated functions (#17125) Signed-off-by: Russell Bryant --- docs/source/conf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index a83ad764125c..c2ad6f9fa3a5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -177,6 +177,11 @@ def linkcode_resolve(domain, info): for part in info['fullname'].split('.'): obj = getattr(obj, part) + # Skip decorator wrappers by checking if the object is a function + # and has a __wrapped__ attribute (which decorators typically set) + while hasattr(obj, '__wrapped__'): + obj = obj.__wrapped__ + if not (inspect.isclass(obj) or inspect.isfunction(obj) or inspect.ismethod(obj)): obj = obj.__class__ # Get the class of the instance From fe921763212b881d9629d04c2eaab4496e136fa5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Apr 2025 13:16:52 -0700 Subject: [PATCH 58/81] Add collective_rpc to llm engine (#16999) Signed-off-by: Yinghai Lu --- vllm/engine/async_llm_engine.py | 18 ++++++++++++++++++ vllm/v1/engine/async_llm.py | 11 +++++++++++ 2 files changed, 29 insertions(+) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index ca8fd83314ae..ba0cdee461a2 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -528,6 +528,13 @@ async def add_request_async( async def check_health_async(self) -> None: self.model_executor.check_health() + async def collective_rpc_async(self, + method: str, + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict] = None): + raise NotImplementedError 
+ async def build_guided_decoding_logits_processor_async( sampling_params: SamplingParams, tokenizer: AnyTokenizer, @@ -1236,6 +1243,17 @@ async def is_sleeping(self) -> bool: async def add_lora(self, lora_request: LoRARequest) -> None: self.engine.add_lora(lora_request) + async def collective_rpc(self, + method: str, + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict] = None): + """ + Perform a collective RPC call to the given path. + """ + return await self.engine.collective_rpc_async(method, timeout, args, + kwargs) + # TODO(v1): Remove this class proxy when V1 goes default. if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fdda812c2853..54f0232da2b2 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -492,6 +492,17 @@ async def pin_lora(self, lora_id: int) -> bool: """Prevent an adapter from being evicted.""" return await self.engine_core.pin_lora_async(lora_id) + async def collective_rpc(self, + method: str, + timeout: Optional[float] = None, + args: tuple = (), + kwargs: Optional[dict] = None): + """ + Perform a collective RPC call to the given path. + """ + return await self.engine_core.collective_rpc_async( + method, timeout, args, kwargs) + @property def is_running(self) -> bool: # Is None before the loop is started. From 05e1fbfc52ca575e6539de63dbb5fab929683162 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Thu, 24 Apr 2025 17:19:36 -0300 Subject: [PATCH 59/81] Add chat template for Llama 4 models (#16428) Signed-off-by: Max de Bayser --- docs/source/features/tool_calling.md | 9 +- examples/tool_chat_template_llama4_json.jinja | 116 ++++++++++++++++++ tests/tool_use/utils.py | 14 +++ .../openai/tool_parsers/llama_tool_parser.py | 1 + 4 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 examples/tool_chat_template_llama4_json.jinja diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md index 57888e122969..f98ec6108cea 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/source/features/tool_calling.md @@ -152,10 +152,11 @@ Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_cha Supported models: -All Llama 3.1 and 3.2 models should be supported. +All Llama 3.1, 3.2 and 4 models should be supported. * `meta-llama/Llama-3.1-*` * `meta-llama/Llama-3.2-*` +* `meta-llama/Llama-4-*` The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. @@ -176,6 +177,12 @@ images. Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}` +VLLM also provides a JSON based chat template for Llama 4: +* `examples/tool_chat_template_llama4_json.jinja` - this is based on the "official" chat template for the Llama 4 +models, but tweaked so that it works better with vLLM. + +For Llama 4 use `--tool-call-parser llama4_json examples/tool_chat_template_llama4_json.jinja`. 
+ #### IBM Granite Supported models: diff --git a/examples/tool_chat_template_llama4_json.jinja b/examples/tool_chat_template_llama4_json.jinja new file mode 100644 index 000000000000..759f16554436 --- /dev/null +++ b/examples/tool_chat_template_llama4_json.jinja @@ -0,0 +1,116 @@ +{%- macro is_array_of_type_objects(var) -%} + {%- if var is iterable and var is not string -%} + {%- set valid = true -%} + {%- for item in var -%} + {%- if 'type' not in item -%} + {%- set valid = false -%} + {%- break -%} + {%- endif -%} + {%- endfor -%} + {{ valid }} + {%- else -%} + {{ false }} + {%- endif -%} +{%- endmacro %} + +{%- macro render_message(message) %} + {%- if message['content'] is string %} + {{- message['content']|trim }} + {%- elif is_array_of_type_objects(data) == 'True' %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {{- '<|image|>' }} + {%- elif content['type'] == 'text' %} + {{- content['text']|trim }} + {%- endif %} + {%- endfor %} + {%- else %} + {{- message['content']|tojson }} + {%- endif %} +{%- endmacro %} + +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0] %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = ({ "content": "You are a helpful assistant with tool calling " + "capabilities. Only reply with a tool call if the function exists in the " + "library provided by the user. If it doesn't exist, just reply directly in " + "natural language. When you receive a tool call response, use the output to " + "format an answer to the original user question."}) %} +{%- endif %} + +{%- set tool_lib_preamble = 'Tools: You have access to the following tools. You might need to use one ' + 'or more function/tool calls to fulfill the task. \n' + 'If none are needed, then proceed to the response.\n\n' + 'Tool Call Syntax: You can call tools using the following syntax:\n' + '{"name": function name, "parameters": dictionary of argument name and its value}.\n' + 'Separate multiple function calls by "; ". 
Do not use variables.\n' + 'Do not include anything else when calling the tools with the syntax above.\n\n' + 'Here is a list of functions in JSON format that you can invoke.\n' %} + +{{- "<|header_start|>system<|header_end|>\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- tool_lib_preamble }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- render_message(system_message) }} +{{ "<|eot|>\n" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0] %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|header_start|>user<|header_end|>\n\n' }} + {{- tool_lib_preamble }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- render_message(first_user_message) + "\n<|eot|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }} + {{- render_message(message) }} + {{- "\n<|eot|>" }} + {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %} + {{- '\n<|header_start|>assistant<|header_end|>\n\n' -}} + {{- render_message(message) }} + {%- for tool_call in message.tool_calls %} + {{- '{"name": "' + tool_call.function.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.function.arguments | tojson }} + {{- "}" }} + {%- endfor %} + {{- "\n<|eot|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "\n<|header_start|>ipython<|header_end|>\n\n" }} + {{- render_message(message) }} + {{- "\n<|eom|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '\n<|header_start|>assistant<|header_end|>\n\n' }} +{%- endif %} diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index 7c87c73f04da..c14eaf71e978 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -98,6 +98,20 @@ def ensure_system_prompt(messages: list[dict[str, Any]], "extended": True }, + "llama4_json": { + "model": + "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "arguments": [ + "--enforce-eager", "--no-enable-prefix-caching", "-tp", "4", + "--distributed-executor-backend", "mp", "--tool-call-parser", + "llama4_json", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_llama4_json.jinja") + ], + "supports_parallel": + True, + "extended": + True + }, "mistral": { "model": "mistralai/Mistral-7B-Instruct-v0.3", diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 20c3238fb3df..5c181616aa01 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -27,6 +27,7 @@ @ToolParserManager.register_module("llama3_json") +@ToolParserManager.register_module("llama4_json") class Llama3JsonToolParser(ToolParser): """ Tool call parser for Llama 3.1 models intended for use with the From 583e9009964b645133022c4efa6688c32508e675 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Thu, 24 Apr 2025 15:25:21 -0700 Subject: [PATCH 60/81] [Misc] Add 
example to run DeepSeek with Ray Serve LLM (#17134) Signed-off-by: Rui Qiao --- examples/online_serving/ray_serve_deepseek.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 examples/online_serving/ray_serve_deepseek.py diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py new file mode 100644 index 000000000000..7f1507dd546a --- /dev/null +++ b/examples/online_serving/ray_serve_deepseek.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Example to deploy DeepSeek R1 or V3 with Ray Serve LLM. +See Ray Serve LLM documentation at: +https://docs.ray.io/en/latest/serve/llm/serving-llms.html + +Run `python3 ray_serve_deepseek.py` to deploy the model. +""" + +from ray import serve +from ray.serve.llm import LLMConfig, LLMRouter, LLMServer + +llm_config = LLMConfig( + model_loading_config=dict( + model_id="deepseek", + # Change to model download path + model_source="/path/to/the/model", + ), + deployment_config=dict(autoscaling_config=dict( + min_replicas=1, + max_replicas=1, + )), + # Change to the accelerator type of the node + accelerator_type="H100", + runtime_env=dict(env_vars=dict(VLLM_USE_V1="1")), + # Customize engine arguments as needed (e.g. vLLM engine kwargs) + engine_kwargs=dict( + tensor_parallel_size=8, + pipeline_parallel_size=2, + gpu_memory_utilization=0.92, + dtype="auto", + max_num_seqs=40, + max_model_len=16384, + enable_chunked_prefill=True, + enable_prefix_caching=True, + trust_remote_code=True, + ), +) + +# Deploy the application +deployment = LLMServer.as_deployment( + llm_config.get_serve_options(name_prefix="vLLM:")).bind(llm_config) +llm_app = LLMRouter.as_deployment().bind([deployment]) +serve.run(llm_app) From 9420a1fc30af1a632bbc2c66eb8668f3af41f026 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 24 Apr 2025 17:43:08 -0600 Subject: [PATCH 61/81] Better error message for missing mistral params.json (#17132) Signed-off-by: mgoin --- vllm/transformers_utils/config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index c9f9af45044e..ee991eaeb34a 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -650,6 +650,11 @@ def load_params_config(model: Union[str, Path], revision: Optional[str], config_file_name = "params.json" config_dict = get_hf_file_to_dict(config_file_name, model, revision) + if config_dict is None: + raise ValueError( + f"Failed to load mistral '{config_file_name}' config for model " + f"{model}. 
Please check if the model is a mistral-format model " + f"and if the config file exists.") assert isinstance(config_dict, dict) config_mapping = { From 0d6e187e88874c39cda7409cf673f9e6546893e7 Mon Sep 17 00:00:00 2001 From: jglaser Date: Thu, 24 Apr 2025 21:57:16 -0400 Subject: [PATCH 62/81] Use custom address for listening socket (#15988) Signed-off-by: Jens Glaser --- vllm/distributed/utils.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 2cb57afd4566..e4d4008cd0a6 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -7,6 +7,7 @@ import dataclasses import datetime import pickle +import socket import time from collections import deque from typing import Any, Deque, Dict, Optional, Sequence, Tuple @@ -123,6 +124,10 @@ class StatelessProcessGroup: rank: int world_size: int store: torch._C._distributed_c10d.Store + + # stores a reference to the socket so that the file descriptor stays alive + socket: Optional[socket.socket] + data_expiration_seconds: int = 3600 # 1 hour # dst rank -> counter @@ -234,18 +239,33 @@ def create( can call `StatelessProcessGroup.create` to form a group, and then process A, B, C, and D can call `StatelessProcessGroup.create` to form another group. """ # noqa + launch_server = rank == 0 + if launch_server: + # listen on the specified interface (instead of 0.0.0.0) + listen_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + listen_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + listen_socket.bind((host, port)) + listen_socket.listen() + listen_fd = listen_socket.fileno() + else: + listen_socket = None + listen_fd = None + store = TCPStore( host_name=host, port=port, world_size=world_size, - is_master=(rank == 0), + is_master=launch_server, timeout=datetime.timedelta(seconds=store_timeout), + use_libuv=False, # for now: github.com/pytorch/pytorch/pull/150215 + master_listen_fd=listen_fd, ) return StatelessProcessGroup( rank=rank, world_size=world_size, store=store, + socket=listen_socket, data_expiration_seconds=data_expiration_seconds) From eef364723c1850f62942d4d832c8f8a2eb31988c Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 25 Apr 2025 11:06:50 +0800 Subject: [PATCH 63/81] [FEAT] [ROCm]: AITER Fused MOE V1 Support (#16752) Signed-off-by: vllmellm Co-authored-by: tjtanaa --- .../model_executor/test_enabled_custom_ops.py | 7 +- .../layers/fused_moe/rocm_aiter_fused_moe.py | 427 ++++++++++++------ .../compressed_tensors_moe.py | 6 +- 3 files changed, 306 insertions(+), 134 deletions(-) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index ac2e0f3542e7..2d9cf1d48fd5 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -11,6 +11,8 @@ dispatch_fused_experts_func, dispatch_topk_func, torch_vllm_inplace_fused_experts, torch_vllm_outplace_fused_experts, vllm_topk_softmax) +from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + is_rocm_aiter_moe_enabled) from vllm.model_executor.layers.layernorm import ( RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm, rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm) @@ -100,11 +102,10 @@ def test_enabled_ops_invalid(env: str): def test_topk_dispatch(use_rocm_aiter: str, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) topk_func = dispatch_topk_func() - + is_rocm_aiter_moe_enabled.cache_clear() 
if current_platform.is_rocm() and int(use_rocm_aiter): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( rocm_aiter_topk_softmax) - assert topk_func == rocm_aiter_topk_softmax else: assert topk_func == vllm_topk_softmax @@ -116,11 +117,11 @@ def test_fused_experts_dispatch(use_rocm_aiter: str, inplace: bool, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter) + is_rocm_aiter_moe_enabled.cache_clear() fused_experts_func = dispatch_fused_experts_func(inplace) if current_platform.is_rocm() and int(use_rocm_aiter): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( rocm_aiter_fused_experts) - assert fused_experts_func == rocm_aiter_fused_experts elif inplace: assert fused_experts_func == torch_vllm_inplace_fused_experts diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 1315dcead49d..acaa93f5a23e 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -1,31 +1,35 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from functools import cache +from typing import List, Optional, Tuple import torch -import vllm.envs as envs +from vllm import envs from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op +@cache def is_rocm_aiter_moe_enabled() -> bool: return current_platform.is_rocm() \ and envs.VLLM_ROCM_USE_AITER_MOE \ and envs.VLLM_ROCM_USE_AITER -def rocm_aiter_asm_moe_tkw1(hidden_states, - w1, - w2, - topk_weight, - topk_ids, - fc1_scale=None, - fc2_scale=None, - fc1_smooth_scale=None, - fc2_smooth_scale=None, - a16=False, - per_tensor_quant_scale=None, - expert_mask=None, - activation_str: str = "silu") -> None: +def rocm_aiter_asm_moe_tkw1_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + fc1_scale: Optional[torch.Tensor] = None, + fc2_scale: Optional[torch.Tensor] = None, + fc1_smooth_scale: Optional[torch.Tensor] = None, + fc2_smooth_scale: Optional[torch.Tensor] = None, + a16: bool = False, + per_tensor_quant_scale: Optional[torch.Tensor] = None, + expert_mask: Optional[torch.Tensor] = None, + activation_str: str = "silu") -> torch.Tensor: from aiter import ActivationType from aiter.fused_moe_bf16_asm import asm_moe_tkw1 @@ -48,34 +52,236 @@ def rocm_aiter_asm_moe_tkw1(hidden_states, activation=activation) -def rocm_aiter_fused_experts( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - inplace: bool = False, - activation: str = "silu", - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[List[int]] = None, - allow_deep_gemm: bool = False, -) -> torch.Tensor: - - import aiter as rocm_aiter +def rocm_aiter_asm_moe_tkw1_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: 
torch.Tensor, + fc1_scale: Optional[torch.Tensor] = None, + fc2_scale: Optional[torch.Tensor] = None, + fc1_smooth_scale: Optional[torch.Tensor] = None, + fc2_smooth_scale: Optional[torch.Tensor] = None, + a16: bool = False, + per_tensor_quant_scale: Optional[torch.Tensor] = None, + expert_mask: Optional[torch.Tensor] = None, + activation_str: str = "silu") -> torch.Tensor: + return torch.empty_like(hidden_states) + + +def rocm_aiter_ck_moe_impl(hidden_states: torch.Tensor, w1: torch.Tensor, + w2: torch.Tensor, topk_weights: torch.Tensor, + topk_ids: torch.Tensor) -> torch.Tensor: + from aiter import ck_moe + return ck_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids) + + +def rocm_aiter_ck_moe_fake(hidden_states: torch.Tensor, w1: torch.Tensor, + w2: torch.Tensor, topk_weights: torch.Tensor, + topk_ids: torch.Tensor) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +def rocm_aiter_fmoe_fp8_blockscale_g1u1_impl( + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + hidden_states_dtype: torch.dtype, + expert_mask: torch.Tensor, + a1: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: torch.Tensor, + block_shape: List[int], + smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor: + from aiter import fmoe_fp8_blockscale_g1u1 + from aiter.fused_moe_bf16_asm import moe_sorting_ck + + topk = topk_ids.shape[1] + model_dim = w1.shape[-1] + local_E = E = w1.shape[0] + if expert_mask is not None: + E = expert_mask.numel() + + ( + sorted_token_ids, + sorted_weight_buf, + sorted_expert_ids, + num_valid_ids, + out_asm, + ) = moe_sorting_ck(topk_ids, + topk_weights, + E, + model_dim, + hidden_states_dtype, + expert_mask=expert_mask) + + fmoe_fp8_blockscale_g1u1(out_asm, a1, w1, w2, sorted_token_ids, + sorted_weight_buf, sorted_expert_ids, + num_valid_ids, topk, w1_scale.view(local_E, -1), + w2_scale.view(local_E, -1), + a1_scale.t().contiguous(), *block_shape, + smooth_scale) + + return out_asm + + +def rocm_aiter_fmoe_fp8_blockscale_g1u1_fake( + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + hidden_states_dtype: torch.dtype, + expert_mask: torch.Tensor, + a1: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + a1_scale: torch.Tensor, + block_shape: List[int], + smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor: + + return torch.empty_like(a1, dtype=torch.bf16) + + +def rocm_aiter_asm_moe_impl(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + fc1_scale: Optional[torch.Tensor] = None, + fc2_scale: Optional[torch.Tensor] = None, + fc1_smooth_scale: Optional[torch.Tensor] = None, + fc2_smooth_scale: Optional[torch.Tensor] = None, + a16: bool = False, + activation: str = "silu") -> torch.Tensor: import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe + from aiter import ActivationType + + assert activation in ["silu", "gelu"], "The given activation:" \ + f" {activation}" \ + " is not supported in" \ + " AITER." 
+ if activation == "silu": + aiter_activation = ActivationType.Silu + else: + aiter_activation = ActivationType.Gelu + + return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weight=topk_weight, + topk_ids=topk_ids, + fc1_scale=fc1_scale, + fc2_scale=fc2_scale, + fc1_smooth_scale=fc1_smooth_scale, + fc2_smooth_scale=fc2_smooth_scale, + a16=a16, + activation=aiter_activation) + + +def rocm_aiter_asm_moe_fake(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + fc1_scale: Optional[torch.Tensor] = None, + fc2_scale: Optional[torch.Tensor] = None, + fc1_smooth_scale: Optional[torch.Tensor] = None, + fc2_smooth_scale: Optional[torch.Tensor] = None, + a16: bool = False, + activation: str = "silu") -> torch.Tensor: + return torch.empty_like(hidden_states) + + +def rocm_aiter_topk_softmax_impl(topk_weights: torch.Tensor, + topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, + renormalize: bool) -> None: + from aiter import topk_softmax + topk_softmax(topk_weights, topk_indices, token_expert_indices, + gating_output, renormalize) + + +def rocm_aiter_topk_softmax_fake(topk_weights: torch.Tensor, + topk_indices: torch.Tensor, + token_expert_indices: torch.Tensor, + gating_output: torch.Tensor, + renormalize: bool) -> None: + pass + + +if current_platform.is_rocm(): + + direct_register_custom_op( + op_name="rocm_aiter_asm_moe_tkw1", + op_func=rocm_aiter_asm_moe_tkw1_impl, + mutates_args=[], + fake_impl=rocm_aiter_asm_moe_tkw1_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_ck_moe", + op_func=rocm_aiter_ck_moe_impl, + mutates_args=[], + fake_impl=rocm_aiter_ck_moe_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_fmoe_fp8_blockscale_g1u1", + op_func=rocm_aiter_fmoe_fp8_blockscale_g1u1_impl, + mutates_args=[], + fake_impl=rocm_aiter_fmoe_fp8_blockscale_g1u1_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_asm_moe", + op_func=rocm_aiter_asm_moe_impl, + mutates_args=[], + fake_impl=rocm_aiter_asm_moe_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_topk_softmax", + op_func=rocm_aiter_topk_softmax_impl, + mutates_args=["topk_weights", "topk_indices", "token_expert_indices"], + fake_impl=rocm_aiter_topk_softmax_fake, + dispatch_key=current_platform.dispatch_key, + ) + + +def rocm_aiter_fused_experts(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + block_shape: Optional[List[int]] = None, + allow_deep_gemm: bool = False) -> torch.Tensor: from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) @@ -84,60 +290,24 @@ def 
rocm_aiter_fused_experts( topk_weights = topk_weights.to(torch.float32) topk_ids = topk_ids.to(torch.int32) - if (block_shape is not None) and use_fp8_w8a8: + # w8a8 block-scaled + if block_shape is not None and use_fp8_w8a8: assert not apply_router_weight_on_input, ( "apply_router_weight_on_input is not supported for block scaled moe" ) - assert w1_scale is not None assert w2_scale is not None - local_E = E = w1.shape[0] - if expert_map is not None: - E = expert_map.numel() - - topk = topk_ids.shape[1] - model_dim = w1.shape[-1] - dtype = hidden_states.dtype # The default block sizes are 128 in AITER. - if block_shape is None: - block_shape = [128, 128] - - scale_blk_k = block_shape[1] - - ( - sorted_token_ids, - sorted_weight_buf, - sorted_expert_ids, - num_valid_ids, - out_asm, - ) = rocm_aiter_asm_fmoe.moe_sorting_ck(topk_ids, - topk_weights, - E, - model_dim, - dtype, - expert_mask=expert_map) - - a1, a1_scale = per_token_group_quant_fp8(hidden_states, scale_blk_k) - rocm_aiter.fmoe_fp8_blockscale_g1u1( - out_asm, - a1, - w1, - w2, - sorted_token_ids, - sorted_weight_buf, - sorted_expert_ids, - num_valid_ids, - topk, - w1_scale.view(local_E, -1), - w2_scale.view(local_E, -1), - a1_scale.t().contiguous(), - block_shape[0], - block_shape[1], - None, - ) - return out_asm + block_shape = [128, 128] if block_shape is None else block_shape + + a1, a1_scale = per_token_group_quant_fp8(hidden_states, block_shape[1]) + return torch.ops.vllm.rocm_aiter_fmoe_fp8_blockscale_g1u1( + topk_ids, topk_weights, hidden_states.dtype, expert_map, a1, w1, + w2, w1_scale, w2_scale, a1_scale, block_shape, None) + + # w8a8 per-channel quantization elif per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8: # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input` # This applies topk_weights on the GEMM output of the first FC layer @@ -148,34 +318,36 @@ def rocm_aiter_fused_experts( "Only support topk=1 when" " `apply_router_weight_on_input` is True") - return rocm_aiter_asm_moe_tkw1(hidden_states, - w1, - w2, - topk_weights, - topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - fc1_smooth_scale=None, - fc2_smooth_scale=None, - a16=False, - per_tensor_quant_scale=None, - expert_mask=expert_map, - activation_str=activation) - + return torch.ops.vllm.rocm_aiter_asm_moe_tkw1( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + fc1_scale=w1_scale, + fc2_scale=w2_scale, + fc1_smooth_scale=None, + fc2_smooth_scale=None, + a16=False, + per_tensor_quant_scale=None, + expert_mask=expert_map, + activation_str=activation) + + # w8a8 per-tensor activation per-tensor weight elif use_fp8_w8a8: assert not apply_router_weight_on_input, ( "apply_router_weight_on_input is not supported for fp8_w8a8") - return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weight=topk_weights, - topk_ids=topk_ids, - fc1_scale=w1_scale, - fc2_scale=w2_scale, - fc1_smooth_scale=None, - fc2_smooth_scale=None, - a16=False) - + return torch.ops.vllm.rocm_aiter_asm_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weight=topk_weights, + topk_ids=topk_ids, + fc1_scale=w1_scale, + fc2_scale=w2_scale, + fc1_smooth_scale=None, + fc2_smooth_scale=None, + a16=False, + activation=activation) if apply_router_weight_on_input: assert (topk_weights.dim() == 2 ), "`topk_weights` should be in shape (num_tokens, topk)" @@ -188,26 +360,26 @@ def rocm_aiter_fused_experts( topk_ids = topk_ids.to(torch.int32) topk_weights = torch.ones_like(topk_weights, dtype=torch.float32) - return 
rocm_aiter.ck_moe(hidden_states=hidden_states, - w1=w1, - w2=w2, - topk_weights=topk_weights, - topk_ids=topk_ids) + # w16a16 fallback to rocm_aiter_ck_moe w16a16 + return torch.ops.vllm.rocm_aiter_ck_moe(hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids) def rocm_aiter_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor, token_expert_indices: torch.Tensor, gating_output: torch.Tensor, - renormalize: bool) -> tuple[torch.Tensor, ...]: - import aiter as rocm_aiter - rocm_aiter.topk_softmax(topk_weights, topk_indices, token_expert_indices, - gating_output, renormalize) - + renormalize: bool) -> Tuple[torch.Tensor, ...]: + torch.ops.vllm.rocm_aiter_topk_softmax(topk_weights, topk_indices, + token_expert_indices, gating_output, + renormalize) return topk_weights, topk_indices -def shuffle_weights(*tensors: torch.Tensor) -> tuple[torch.Tensor, ...]: +def shuffle_weights(*tensors: torch.Tensor) -> Tuple[torch.Tensor, ...]: """ Applies shuffle_weight function from AITER to each input tensor and returns them. @@ -216,15 +388,14 @@ def shuffle_weights(*tensors: torch.Tensor) -> tuple[torch.Tensor, ...]: *tensors: Variable number of torch.Tensor objects. Returns: - A tuple of shuffled tensors. + A Tuple of shuffled tensors. """ from aiter.ops.shuffle import shuffle_weight - return tuple(shuffle_weight(tensor) for tensor in tensors) def expand_weights(*tensors: torch.Tensor, - expansion_dims: list[int]) -> tuple[torch.Tensor, ...]: + expansion_dims: list[int]) -> Tuple[torch.Tensor, ...]: """ Expands the dimensions of input tensors. @@ -234,7 +405,7 @@ def expand_weights(*tensors: torch.Tensor, corresponding to each tensor. Returns: - A tuple of tensors with expanded dimensions. + A Tuple of tensors with expanded dimensions. 
""" assert len(tensors) == len(expansion_dims), \ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index d74d4e9273b7..721e36af2b28 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -304,9 +304,9 @@ def apply( e_score_correction_bias=e_score_correction_bias) return self.fused_experts_func( - x, - layer.w13_weight, - layer.w2_weight, + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, From 41ca7eb49192cbee65986527319c6eb929d6aa7c Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 24 Apr 2025 23:12:21 -0400 Subject: [PATCH 64/81] [Attention] FA3 decode perf improvement - single mma warp group support for head dim 128 (#16864) Signed-off-by: Lucas Wilkinson --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 110ef266c665..c60a54745519 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 0a721daebe4fa7149f06ecf3d3eabeb6dcd0f1fa + GIT_TAG e93779c59ba4905e56e5c39dc2c1904ada71fa21 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From 69bff9bc893475fbd64c47633cb8ece46cd462c7 Mon Sep 17 00:00:00 2001 From: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> Date: Fri, 25 Apr 2025 11:16:32 +0800 Subject: [PATCH 65/81] fix float16 support for kimi-vl (#17156) Co-authored-by: zhouzaida --- vllm/model_executor/models/kimi_vl.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 5c39907d79e4..8cb8bc22fc0e 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -340,8 +340,7 @@ def _parse_and_validate_image_input( else: pixel_values = pixel_values.reshape(-1, num_channels, patch_size, patch_size) - # fp32 -> bf16 - pixel_values = pixel_values.to(torch.bfloat16) + pixel_values = pixel_values.to(self.vision_tower.dtype) # image_grid_hws.shape = (N, 2) assert image_grid_hws.ndim == 2, f"unexpected shape for image_grid_hws: {image_grid_hws.shape}" From 7a0a9da72bad7c649d4c74b363040e0744e5d37d Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 24 Apr 2025 23:17:22 -0400 Subject: [PATCH 66/81] [Doc] V1 : Update LoRA status (#17133) Signed-off-by: varun sundar rabindranath Co-authored-by: varun sundar rabindranath --- docs/source/getting_started/v1_user_guide.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docs/source/getting_started/v1_user_guide.md b/docs/source/getting_started/v1_user_guide.md index a87484c3bb04..de90b8a7851e 100644 --- a/docs/source/getting_started/v1_user_guide.md +++ b/docs/source/getting_started/v1_user_guide.md @@ -44,8 +44,8 @@ This living user guide outlines a few known **important changes and limitations* |-----------------|-----------------------------------------------------------------------------------| 
| **Prefix Caching** | 🚀 Optimized |
| **Chunked Prefill** | 🚀 Optimized |
+| **LoRA** | 🚀 Optimized |
| **Logprobs Calculation** | 🟢 Functional |
-| **LoRA** | 🟢 Functional ([PR #13096](https://github.com/vllm-project/vllm/pull/13096))|
| **Multimodal Models** | 🟢 Functional |
| **FP8 KV Cache** | 🟢 Functional on Hopper devices ([PR #15191](https://github.com/vllm-project/vllm/pull/15191))|
| **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))|
@@ -121,11 +121,6 @@ Although we have re-implemented and partially optimized many features and models

These features are already supported in vLLM V1, but their optimization is still
in progress.

-- **LoRA**: LoRA is functionally working on vLLM V1 but its performance is
-  inferior to that of V0. The team is actively working on improving its
-  performance
-(e.g., see [PR #13096](https://github.com/vllm-project/vllm/pull/13096)).
-
- **Spec Decode**: Currently, only ngram-based spec decode is supported in V1. There
will be follow-up work to support other types of spec decode (e.g., see [PR
#13933](https://github.com/vllm-project/vllm/pull/13933)). We will prioritize the support for Eagle, MTP compared to draft model based spec decode.

From 649818995f60b410a57059445ec1d66f8db347b5 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Thu, 24 Apr 2025 22:20:04 -0600
Subject: [PATCH 67/81] [Docs] Fix True->true in supported_models.md (#17141)

---
 docs/source/models/supported_models.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 01427f4f2ba4..6b101662fc14 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -1082,7 +1082,7 @@ See [this page](#generative-models) for more information on how to use generativ
:::{important}
Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
-You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`.
+You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": true}'`.
:::

:::{warning}
@@ -1097,7 +1097,7 @@ V0 correctly implements the model's attention pattern:
V1 currently uses a simplified attention pattern:
- Uses causal attention for all tokens, including image tokens
-- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": True}`
+- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
- Will be updated in the future to support the correct behavior

This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
@@ -1129,7 +1129,7 @@ To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from
`pip install git+https://github.com/huggingface/transformers.git`.

Read audio from video pre-processing is currently supported on V0 (but not V1), because overlapping modalities is not yet supported in V1.
-`--mm-processor-kwargs '{"use_audio_in_video": True}'`.
+`--mm-processor-kwargs '{"use_audio_in_video": true}'`. 
::: ### Pooling Models From 6ca02344780a745ccb31511ee97f4a1e59228bbf Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 25 Apr 2025 06:48:53 +0100 Subject: [PATCH 68/81] Move missed `SchedulerConfig` args into scheduler config group in `EngineArgs` (#17131) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 3 ++- vllm/engine/arg_utils.py | 23 +++++++---------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 41a30efea039..8551e0dd734a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1778,6 +1778,7 @@ def _verify_args(self) -> None: "worker_extension_cls must be a string (qualified class name).") +PreemptionMode = Literal["swap", "recompute"] SchedulerPolicy = Literal["fcfs", "priority"] @@ -1854,7 +1855,7 @@ class SchedulerConfig: NOTE: This is not currently configurable. It will be overridden by max_num_batched_tokens in case max multimodal embedding size is larger.""" - preemption_mode: Optional[str] = None + preemption_mode: Optional[PreemptionMode] = None """Whether to perform preemption by swapping or recomputation. If not specified, we determine the mode as follows: We use recomputation by default since it incurs lower overhead than diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9cb2aa797be5..68bea93b3354 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -753,12 +753,6 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: ) device_group.add_argument("--device", **device_kwargs["device"]) - parser.add_argument('--num-scheduler-steps', - type=int, - default=1, - help=('Maximum number of forward steps per ' - 'scheduler call.')) - # Speculative arguments speculative_group = parser.add_argument_group( title="SpeculativeConfig", @@ -779,13 +773,6 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: help="The pattern(s) to ignore when loading the model." 
"Default to `original/**/*` to avoid repeated loading of llama's " "checkpoints.") - parser.add_argument( - '--preemption-mode', - type=str, - default=None, - help='If \'recompute\', the engine performs preemption by ' - 'recomputing; If \'swap\', the engine performs preemption by ' - 'block swapping.') parser.add_argument( "--served-model-name", @@ -865,14 +852,18 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: **scheduler_kwargs["num_lookahead_slots"]) scheduler_group.add_argument('--scheduler-delay-factor', **scheduler_kwargs["delay_factor"]) - scheduler_group.add_argument( - '--enable-chunked-prefill', - **scheduler_kwargs["enable_chunked_prefill"]) + scheduler_group.add_argument('--preemption-mode', + **scheduler_kwargs["preemption_mode"]) + scheduler_group.add_argument('--num-scheduler-steps', + **scheduler_kwargs["num_scheduler_steps"]) scheduler_group.add_argument( '--multi-step-stream-outputs', **scheduler_kwargs["multi_step_stream_outputs"]) scheduler_group.add_argument('--scheduling-policy', **scheduler_kwargs["policy"]) + scheduler_group.add_argument( + '--enable-chunked-prefill', + **scheduler_kwargs["enable_chunked_prefill"]) scheduler_group.add_argument( "--disable-chunked-mm-input", **scheduler_kwargs["disable_chunked_mm_input"]) From 5aa6efb9a5929d9b4b6b2c231b81f63587e83c77 Mon Sep 17 00:00:00 2001 From: Lifu Huang Date: Thu, 24 Apr 2025 22:49:30 -0700 Subject: [PATCH 69/81] [Misc] Clean up redundant code in uniproc_executor.py (#16762) Signed-off-by: Lifu Huang --- vllm/executor/uniproc_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 8c004c790fcb..2e4b47c1e24a 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -34,13 +34,13 @@ def _init_executor(self) -> None: if len(device_info) > 1: local_rank = int(device_info[1]) rank = 0 + is_driver_worker = True kwargs = dict( vllm_config=self.vllm_config, local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method, - is_driver_worker=(not self.parallel_config) - or (rank % self.parallel_config.tensor_parallel_size == 0), + is_driver_worker=is_driver_worker, ) self.collective_rpc("init_worker", args=([kwargs], )) self.collective_rpc("init_device") From 2f54045508930a741f3638cc3c1174fdd672087f Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Fri, 25 Apr 2025 13:51:02 +0800 Subject: [PATCH 70/81] [Bugfix][Misc] Use TritonPlaceholderModule to defensively import triton (#15099) Signed-off-by: Mengqing Cao --- benchmarks/kernels/benchmark_lora.py | 10 ++++- .../layers/mamba/ops/mamba_ssm.py | 4 +- vllm/triton_utils/__init__.py | 2 +- vllm/triton_utils/importing.py | 40 ++++++++++++++++++- vllm/utils.py | 3 ++ 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index b4b91eda2844..d382ede10b41 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -17,8 +17,14 @@ from utils import ArgPool, Bench, CudaGraphBenchParams from weight_shapes import WEIGHT_SHAPES -from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink -from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.triton_utils import HAS_TRITON + +if HAS_TRITON: + from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand, + lora_shrink) + from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT, + _LORA_B_PTR_DICT) + 
from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py index b31b980fbe84..9fbad9d2f91e 100644 --- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py +++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py @@ -10,8 +10,10 @@ from vllm import _custom_ops as ops from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.triton_utils import HAS_TRITON -TRITON3 = version.parse(triton.__version__) >= version.parse("3.0.0") +TRITON3 = HAS_TRITON and (version.parse(triton.__version__) + >= version.parse("3.0.0")) if TRITON3: diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py index 43918bcd7c55..bffc56a2e75c 100644 --- a/vllm/triton_utils/__init__.py +++ b/vllm/triton_utils/__init__.py @@ -2,4 +2,4 @@ from vllm.triton_utils.importing import HAS_TRITON -__all__ = ["HAS_TRITON"] \ No newline at end of file +__all__ = ["HAS_TRITON"] diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index a20700248c26..fa29efbf6b2d 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -1,17 +1,53 @@ # SPDX-License-Identifier: Apache-2.0 +import sys +import types from importlib.util import find_spec from vllm.logger import init_logger -from vllm.platforms import current_platform logger = init_logger(__name__) HAS_TRITON = ( find_spec("triton") is not None - and not current_platform.is_xpu() # Not compatible + or find_spec("pytorch-triton-xpu") is not None # Not compatible ) if not HAS_TRITON: logger.info("Triton not installed or not compatible; certain GPU-related" " functions will not be available.") + + class TritonPlaceholder(types.ModuleType): + + def __init__(self): + super().__init__("triton") + self.jit = self._dummy_decorator("jit") + self.autotune = self._dummy_decorator("autotune") + self.heuristics = self._dummy_decorator("heuristics") + self.language = TritonLanguagePlaceholder() + logger.warning_once( + "Triton is not installed. Using dummy decorators. 
" + "Install it via `pip install triton` to enable kernel" + "compilation.") + + def _dummy_decorator(self, name): + + def decorator(func=None, **kwargs): + if func is None: + return lambda f: f + return func + + return decorator + + class TritonLanguagePlaceholder(types.ModuleType): + + def __init__(self): + super().__init__("triton.language") + self.constexpr = None + self.dtype = None + + sys.modules['triton'] = TritonPlaceholder() + sys.modules['triton.language'] = TritonLanguagePlaceholder() + +if 'triton' in sys.modules: + logger.info("Triton module has been replaced with a placeholder.") diff --git a/vllm/utils.py b/vllm/utils.py index ed406a6b7b11..92b493a69e58 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -63,6 +63,9 @@ from typing_extensions import Never, ParamSpec, TypeIs, assert_never import vllm.envs as envs +# NOTE: import triton_utils to make TritonPlaceholderModule work +# if triton is unavailable +import vllm.triton_utils # noqa: F401 from vllm.logger import enable_trace_function_call, init_logger if TYPE_CHECKING: From 881f735827a466f1efd7c7100bd423e4081a2ce5 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 25 Apr 2025 01:53:55 -0400 Subject: [PATCH 71/81] [Misc] Benchmark Serving Script Support Appending Results (#17028) Signed-off-by: Lucas Wilkinson --- benchmarks/benchmark_serving.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index bc125f6b73dc..da124e1a81b4 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -713,7 +713,7 @@ def main(args: argparse.Namespace): )) # Save config and results to json - if args.save_result: + if args.save_result or args.append_result: result_json: dict[str, Any] = {} # Setup @@ -734,6 +734,14 @@ def main(args: argparse.Namespace): raise ValueError( "Invalid metadata format. Please use KEY=VALUE format." ) + # Traffic + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") + result_json["burstiness"] = args.burstiness + result_json["max_concurrency"] = args.max_concurrency + + # Merge with benchmark result + result_json = {**result_json, **benchmark_result} if not args.save_detailed: # Remove fields with too many data points @@ -744,15 +752,6 @@ def main(args: argparse.Namespace): if field in result_json: del result_json[field] - # Traffic - result_json["request_rate"] = (args.request_rate if args.request_rate - < float("inf") else "inf") - result_json["burstiness"] = args.burstiness - result_json["max_concurrency"] = args.max_concurrency - - # Merge with benchmark result - result_json = {**result_json, **benchmark_result} - # Save to file base_model_id = model_id.split("/")[-1] max_concurrency_str = (f"-concurrency{args.max_concurrency}" @@ -762,7 +761,12 @@ def main(args: argparse.Namespace): file_name = args.result_filename if args.result_dir: file_name = os.path.join(args.result_dir, file_name) - with open(file_name, "w", encoding='utf-8') as outfile: + with open(file_name, + mode="a+" if args.append_result else "w", + encoding='utf-8') as outfile: + # Append a newline. 
+ if args.append_result and outfile.tell() != 0: + outfile.write("\n") json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) @@ -894,6 +898,11 @@ def main(args: argparse.Namespace): help="When saving the results, whether to include per request " "information such as response, error, ttfs, tpots, etc.", ) + parser.add_argument( + "--append-result", + action="store_true", + help="Append the benchmark result to the existing json file.", + ) parser.add_argument( "--metadata", metavar="KEY=VALUE", From b22980a1dc894d98de64119af4a580881f16b2f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?yexin=28=E5=8F=B6=E9=91=AB=29?= Date: Fri, 25 Apr 2025 14:52:28 +0800 Subject: [PATCH 72/81] [Perf]Optimize rotary_emb implementation to use Triton operator for improved inference performance (#16457) Signed-off-by: cynthieye Co-authored-by: MagnetoWang --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- .../model_executor/layers/rotary_embedding.py | 34 +++++++++++++------ 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index c60a54745519..b04e4c2d06ed 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG e93779c59ba4905e56e5c39dc2c1904ada71fa21 + GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index e6f2461eb674..6f4d796fb040 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -46,20 +46,12 @@ def _rotate_gptj(x: torch.Tensor) -> torch.Tensor: return x.flatten(-2) -def _apply_rotary_emb( +def _apply_rotary_emb_torch( x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, is_neox_style: bool, ) -> torch.Tensor: - """ - Args: - x: [num_tokens, num_heads, head_size] - cos: [num_tokens, head_size // 2] - sin: [num_tokens, head_size // 2] - is_neox_style: Whether to use the Neox-style or GPT-J-style rotary - positional embeddings. - """ cos = cos.unsqueeze(-2).to(x.dtype) sin = sin.unsqueeze(-2).to(x.dtype) if is_neox_style: @@ -75,6 +67,24 @@ def _apply_rotary_emb( return torch.stack((o1, o2), dim=-1).flatten(-2) +def _apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, + is_neox_style: bool) -> torch.Tensor: + """ + Args: + x: [num_tokens, num_heads, head_size] + cos: [num_tokens, head_size // 2] + sin: [num_tokens, head_size // 2] + is_neox_style: Whether to use the Neox-style or GPT-J-style rotary + positional embeddings. 
+ """ + if current_platform.is_cuda_alike(): + from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb + return apply_rotary_emb(x.unsqueeze(0), cos, sin, + not is_neox_style).squeeze(0) + else: + return _apply_rotary_emb_torch(x, cos, sin, is_neox_style) + + @CustomOp.register("rotary_embedding") class RotaryEmbedding(CustomOp): """Original rotary positional embedding.""" @@ -141,14 +151,16 @@ def forward_native( query = query.view(num_tokens, -1, self.head_size) query_rot = query[..., :self.rotary_dim] query_pass = query[..., self.rotary_dim:] - query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style) + query_rot = _apply_rotary_emb_torch(query_rot, cos, sin, + self.is_neox_style) query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) key_shape = key.shape key = key.view(num_tokens, -1, self.head_size) key_rot = key[..., :self.rotary_dim] key_pass = key[..., self.rotary_dim:] - key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style) + key_rot = _apply_rotary_emb_torch(key_rot, cos, sin, + self.is_neox_style) key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) return query, key From 6aae216b4e135e6f6889e0bc80efc8a666e264e8 Mon Sep 17 00:00:00 2001 From: Sangyeon Cho Date: Fri, 25 Apr 2025 15:54:43 +0900 Subject: [PATCH 73/81] [Bugfix] remove fallback in guided_json (int range, patterns) (#16725) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: csy1204 Co-authored-by: ์กฐ์ƒ์—ฐ[ํ”Œ๋ ˆ์ด์Šค AI] --- tests/entrypoints/llm/test_guided_generate.py | 58 ++++++++++++++++++- tests/v1/entrypoints/conftest.py | 34 ++++++----- tests/v1/structured_output/test_utils.py | 53 +++++------------ .../guided_decoding/__init__.py | 2 +- vllm/model_executor/guided_decoding/utils.py | 10 +--- vllm/v1/structured_output/backend_xgrammar.py | 9 +-- 6 files changed, 94 insertions(+), 72 deletions(-) diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 6bc32ebadc7a..ad726fa8ce51 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -305,7 +305,7 @@ def test_disable_guided_decoding_fallback(sample_regex, llm): with pytest.raises( ValueError, match="xgrammar does not support advanced JSON schema features " - "like enums, patterns or numeric ranges."): + "like string length, item limits, or property bounds."): llm.generate(prompts="This should fail", sampling_params=sampling_params, use_tqdm=True) @@ -386,6 +386,62 @@ def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str): jsonschema.validate(instance=output_json, schema=json_schema) +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) +def test_guided_number_range_json_completion(llm, + guided_decoding_backend: str): + sample_output_schema = { + "type": "object", + "properties": { + "age": { + "type": "integer", + "minimum": 18, + "maximum": 99 + }, + "score": { + "type": "number", + "minimum": 0.0, + "maximum": 100.0 + }, + "zipcode": { + "type": "string", + "pattern": r"^\d{5}(-\d{4})?$" + }, + }, + "required": ["age", "score", "zipcode"], + } + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams(json=sample_output_schema, + backend=guided_decoding_backend), + ) + outputs = llm.generate( + prompts=[ + "Create a JSON object for a user with age, score, and zipcode." 
+ ] * 2, + sampling_params=sampling_params, + use_tqdm=True, + ) + + assert outputs is not None + + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + output_json = json.loads(generated_text) + jsonschema.validate(instance=output_json, schema=sample_output_schema) + assert 18 <= output_json["age"] <= 99 + assert 0.0 <= output_json["score"] <= 100.0 + assert (re.fullmatch(r"^\d{5}(-\d{4})?$", output_json["zipcode"]) + is not None) + + @pytest.mark.skip_global_cleanup def test_guidance_no_additional_properties(llm): schema = { diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py index 6d4278b4c871..d84b2b22db12 100644 --- a/tests/v1/entrypoints/conftest.py +++ b/tests/v1/entrypoints/conftest.py @@ -47,6 +47,14 @@ def sample_json_schema(): "type": "string", } }, + "grade": { + "type": "string", + "pattern": "^[A-D]$" # Regex pattern + }, + "email": { + "type": "string", + "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" + }, "work_history": { "type": "array", "items": { @@ -56,17 +64,20 @@ def sample_json_schema(): "type": "string" }, "duration": { - "type": "number" + "type": "number", + "minimum": 0.0, + "maximum": 100.0, # Numeric range }, "position": { "type": "string" } }, - "required": ["company", "position"] + "required": ["company", "duration", "position"] } } }, - "required": ["name", "age", "skills", "work_history"] + "required": + ["name", "age", "skills", "grade", "email", "work_history"] } @@ -78,27 +89,18 @@ def unsupported_json_schema(): "properties": { "score": { "type": "integer", - "minimum": 0, - "maximum": 100 # Numeric range - }, - "grade": { - "type": "string", - "pattern": "^[A-D]$" # Regex pattern - }, - "email": { - "type": "string", - "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" + "multipleOf": 5 # Numeric multiple }, "tags": { "type": "array", "items": { "type": "string", - "pattern": - "^[a-z]{1,10}$" # Combining length and pattern restrictions + "minLength": 10, + "maxLength": 20 } } }, - "required": ["score", "grade", "email", "tags"] + "required": ["score", "tags"] } diff --git a/tests/v1/structured_output/test_utils.py b/tests/v1/structured_output/test_utils.py index 337df4517ae9..1cefe8726df7 100644 --- a/tests/v1/structured_output/test_utils.py +++ b/tests/v1/structured_output/test_utils.py @@ -9,10 +9,6 @@ @pytest.fixture def unsupported_string_schemas(): return [ - { - "type": "string", - "pattern": "^[a-zA-Z]+$" - }, { "type": "string", "format": "email" @@ -23,22 +19,6 @@ def unsupported_string_schemas(): @pytest.fixture def unsupported_integer_schemas(): return [ - { - "type": "integer", - "minimum": 0 - }, - { - "type": "integer", - "maximum": 120 - }, - { - "type": "integer", - "exclusiveMinimum": 120 - }, - { - "type": "integer", - "exclusiveMaximum": 120 - }, { "type": "integer", "multipleOf": 120 @@ -49,22 +29,6 @@ def unsupported_integer_schemas(): @pytest.fixture def unsupported_number_schemas(): return [ - { - "type": "number", - "minimum": 0 - }, - { - "type": "number", - "maximum": 120 - }, - { - "type": "number", - "exclusiveMinimum": 120 - }, - { - "type": "number", - "exclusiveMaximum": 120 - }, { "type": "number", "multipleOf": 120 @@ -156,13 +120,28 @@ def supported_schema(): "type": "string", "enum": ["sedan", "suv", "truck"] }, + "car_brand": { + "type": "string", + 
"pattern": "^[a-zA-Z]+$" + }, "short_description": { "type": "string", "maxLength": 50 }, + "mileage": { + "type": "number", + "minimum": 0, + "maximum": 1000000 + }, + "model_year": { + "type": "integer", + "exclusiveMinimum": 1900, + "exclusiveMaximum": 2100 + }, "long_description": { "type": "string", - "minLength": 50 + "minLength": 50, + "maxLength": 2000 }, "address": { "type": "object", diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index d4ee1be9a445..8fdcdcafa980 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -65,7 +65,7 @@ def fallback_or_error(guided_params: GuidedDecodingParams, message: str, fallback_or_error( guided_params, "xgrammar does not support advanced JSON schema features like " - "enums, patterns or numeric ranges.", "outlines") + "string length, item limits, or property bounds.", "outlines") # xgrammar only supports GBNF grammars, so we must convert Lark. # We must check if the grammar is likely Lark and if that diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py index ba7c10252699..1ad1ef8fbf16 100644 --- a/vllm/model_executor/guided_decoding/utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -10,16 +10,8 @@ def check_object(obj: dict) -> bool: if not isinstance(obj, dict): return False - # Check for pattern restrictions - if "pattern" in obj: - return True - # Check for numeric ranges - if obj.get("type") in ("integer", "number") and any( - key in obj for key in [ - "minimum", "maximum", "exclusiveMinimum", - "exclusiveMaximum", "multipleOf" - ]): + if obj.get("type") in ("integer", "number") and ("multipleOf" in obj): return True # Check for array unsupported keywords diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index d978ae2da85c..bb7c7edc278d 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -179,15 +179,8 @@ def check_object(obj: dict[str, Any]) -> bool: if not isinstance(obj, dict): return False - # Check for pattern restrictions - if "pattern" in obj: - return True - # Check for numeric ranges - if obj.get("type") in ("integer", "number") and any( - key in obj - for key in ("minimum", "maximum", "exclusiveMinimum", - "exclusiveMaximum", "multipleOf")): + if obj.get("type") in ("integer", "number") and ("multipleOf" in obj): return True # Check for array unsupported keywords From a41351f363f3e7a212582e51b2c1c35c18aaa9df Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 25 Apr 2025 02:45:02 -0500 Subject: [PATCH 74/81] [Quantization][FP8] Add support for FP8 models with input_scale for output projection and QK quantization (#15734) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Randall Smith Signed-off-by: Luka Govediฤ Co-authored-by: Luka Govediฤ --- vllm/attention/backends/abstract.py | 1 + vllm/attention/backends/rocm_flash_attn.py | 7 +++ vllm/attention/layer.py | 1 + vllm/config.py | 11 +++++ vllm/engine/arg_utils.py | 17 +++++++ .../model_executor/layers/quantization/fp8.py | 5 ++ .../layers/quantization/kv_cache.py | 36 ++++++++++++++ .../layers/quantization/quark/quark.py | 47 +++++++++++-------- 8 files changed, 105 insertions(+), 20 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 82d60f9da7da..2149e89aa734 100644 
--- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -237,6 +237,7 @@ class AttentionLayer(Protocol): _v_scale: torch.Tensor _k_scale_float: float _v_scale_float: float + _prob_scale: torch.Tensor def forward( self, diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 37b6cadcb98a..8076c4791d3c 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -766,6 +766,12 @@ def forward( query.dtype, seq_lens, make_attn_mask=causal_mask) # type: ignore + use_fp8_scales = (layer._q_scale and layer._k_scale + and layer._v_scale and layer._prob_scale + and self.kv_cache_dtype == "fp8") + full_scales = ( + layer._q_scale, layer._k_scale, layer._v_scale, + layer._prob_scale) if use_fp8_scales else None self.triton_attn_func( query, key, @@ -779,6 +785,7 @@ def forward( self.scale, attn_masks[0][None] if attn_masks is not None else None, + full_scales, ) elif self.use_naive_attn: if self.num_kv_heads != self.num_heads: diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 68452f4c03b0..aa218cc37af9 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -90,6 +90,7 @@ def __init__( # FlashAttn doesn't support quantizing the kv-cache only # but requires q to be quantized as well. self._q_scale = torch.tensor(1.0, dtype=torch.float32) + self._prob_scale = torch.tensor(1.0, dtype=torch.float32) # We also keep the float32 versions of k/v_scale for attention # backends that don't support tensors (Flashinfer) diff --git a/vllm/config.py b/vllm/config.py index 8551e0dd734a..0eb15825a3b9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3767,6 +3767,17 @@ def _get_quantization_config( return quant_config return None + @staticmethod + def get_quantization_config( + model_config: ModelConfig, + load_config: LoadConfig) -> Optional[QuantizationConfig]: + import copy + + # For some reason, the _ version of this modifies the model_config + # object, so using deepcopy to avoid this problem. + return VllmConfig._get_quantization_config(copy.deepcopy(model_config), + load_config) + def with_hf_config( self, hf_config: PretrainedConfig, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 68bea93b3354..326bea0cb6a3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1368,6 +1368,23 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: recommend_to_remove=False) return False + if current_platform.is_rocm(): + from vllm.model_executor.layers.quantization.fp8 import Fp8Config + load_config = self.create_load_config() + quantization_config = VllmConfig.get_quantization_config( + model_config, load_config) + if isinstance(quantization_config, Fp8Config): + _raise_or_fallback(feature_name="fp8 for ROCm", + recommend_to_remove=False) + return False + from vllm.model_executor.layers.quantization.quark.quark import ( + QuarkConfig) + + if isinstance(quantization_config, QuarkConfig + ) and quantization_config.has_fp8_layer_weights(): + _raise_or_fallback(feature_name="Quark fp8 for ROCm", + recommend_to_remove=False) + # No Fp8 KV cache so far. 
if self.kv_cache_dtype != "auto": fp8_attention = self.kv_cache_dtype.startswith("fp8") diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index be76785baccc..01056c37b86c 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -140,6 +140,11 @@ def get_cache_scale(self, name: str) -> Optional[str]: return name.replace(".k_proj.output_scale", ".attn.k_scale") if name.endswith(".output_scale") and ".v_proj" in name: return name.replace(".v_proj.output_scale", ".attn.v_scale") + if name.endswith(".output_scale") and ".q_proj" in name: + return name.replace(".q_proj.output_scale", ".attn.q_scale") + if name.endswith("self_attn.prob_output_scale"): + return name.replace(".prob_output_scale", ".attn.prob_scale") + # If no matches, return None return None diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index 5d766c2c27ac..5dff8b09693c 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -38,6 +38,9 @@ def create_weights(self, layer: torch.nn.Module): requires_grad=False) layer.v_scale = torch.nn.Parameter(torch.tensor(-1.0), requires_grad=False) + # Initialize P = softmax(QK^T) scales + layer.prob_scale = torch.nn.Parameter(torch.tensor(-1.0), + requires_grad=False) def apply(self, layer: torch.nn.Module) -> torch.Tensor: raise RuntimeError( @@ -97,5 +100,38 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: "may cause accuracy issues. Please make sure k/v_scale " "scaling factors are available in the fp8 checkpoint.") + if layer.q_scale > 0.0: + q_scale = layer.q_scale + if current_platform.is_fp8_fnuz(): + q_scale *= 2 + layer.calculate_kv_scales = False + else: + q_scale = 1.0 + if layer.prob_scale > 0.0: + prob_scale = layer.prob_scale + if current_platform.is_fp8_fnuz(): + prob_scale *= 2 + else: + prob_scale = 1.0 + + is_singleton_float = lambda x: isinstance(x, float) or isinstance( + x, torch.Tensor) and x.numel() == 1 and x.is_floating_point() + if not is_singleton_float(q_scale) or not is_singleton_float( + prob_scale): + raise ValueError("Only support per-tensor scaling factor" + "for fp8-quantized Q/prob") + + # These are used in the final Attention.forward() + layer._q_scale.copy_(q_scale) + layer._prob_scale.copy_(prob_scale) + if q_scale == 1.0 or prob_scale == 1.0: + logger.warning_once( + f"Using Q scale {q_scale} and prob scale {prob_scale} " + "with fp8 attention. This may cause accuracy issues. " + "Please make sure Q/prob scaling factors are " + "available in the fp8 checkpoint.") + del layer.k_scale del layer.v_scale + del layer.q_scale + del layer.prob_scale diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index ca71da8b736a..c4b3dcfbe8e6 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import fnmatch -import re from typing import Any, Dict, List, Optional, cast import torch @@ -125,6 +124,13 @@ def from_config(cls, config: Dict[str, Any]) -> "QuarkConfig": for q_config in q_configs: q_config["output_tensors"] = None + # In case q_proj output is also quantized, remove the configuration + # to keep qkv consistency. 
+ q_proj_q_config = cast(Dict[str, Any], + layer_quant_config.get("*q_proj")) + if q_proj_q_config is not None: + q_proj_q_config["output_tensors"] = None + return cls(quant_config=config, kv_cache_group=kv_cache_group, kv_cache_config=kv_cache_config, @@ -289,29 +295,30 @@ def get_cache_scale(self, name: str) -> Optional[str]: :param name: param name :return: matching param name for KV cache scale in vLLM """ - if self.kv_cache_group is None or len(self.kv_cache_group) == 0: - return None - - kv_proj_names = [ - re.split(r"[*.]", kv_cache)[-1] for kv_cache in self.kv_cache_group - ] - if name.endswith(".output_scale"): - if len(kv_proj_names) == 1 and kv_proj_names[0] in name: - kv_output_scale_name = "." + kv_proj_names[0] + ".output_scale" - return name.replace(kv_output_scale_name, ".attn.k_scale") - - elif len(kv_proj_names) == 2: - for kv_proj_name in kv_proj_names: - if kv_proj_name in name and kv_proj_name == "k_proj": - return name.replace(".k_proj.output_scale", - ".attn.k_scale") - elif kv_proj_name in name and kv_proj_name == "v_proj": - return name.replace(".v_proj.output_scale", - ".attn.v_scale") + if name.endswith(".output_scale") and ".k_proj" in name: + return name.replace(".k_proj.output_scale", ".attn.k_scale") + if name.endswith(".output_scale") and ".v_proj" in name: + return name.replace(".v_proj.output_scale", ".attn.v_scale") + if name.endswith(".output_scale") and ".q_proj" in name: + return name.replace(".q_proj.output_scale", ".attn.q_scale") + if name.endswith("self_attn.prob_output_scale"): + return name.replace(".prob_output_scale", ".attn.prob_scale") # If no matches, return None return None + def has_fp8_layer_weights(self): + layer_quant_config = self.quant_config.get("layer_quant_config") + to_dict = lambda obj: cast(Dict[str, Any], obj) or {} + return any([ + 'fp8' in cast( + str, + to_dict( + to_dict(to_dict(layer_quant_config).get(layer_name)).get( + "weight")).get("dtype")) + for layer_name in ["*v_proj", "*k_proj", "*q_proj"] + ]) + class QuarkLinearMethod(LinearMethodBase): From ef19e67d2c8d275dd0a40492830aa3e26dd5e7c0 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Fri, 25 Apr 2025 16:13:13 +0800 Subject: [PATCH 75/81] [Doc] Add headings to improve gptqmodel.md (#17164) Signed-off-by: windsonsea --- docs/source/features/quantization/gptqmodel.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/source/features/quantization/gptqmodel.md index 34adf6512b7e..0a1cb0c3d349 100644 --- a/docs/source/features/quantization/gptqmodel.md +++ b/docs/source/features/quantization/gptqmodel.md @@ -16,12 +16,16 @@ GPTQModel is one of the few quantization toolkits in the world that allows `Dyna is fully integrated into vLLM and backed up by support from the ModelCloud.AI team. Please refer to [GPTQModel readme](https://github.com/ModelCloud/GPTQModel?tab=readme-ov-file#dynamic-quantization-per-module-quantizeconfig-override) for more details on this and other advanced features. +## Installation + You can quantize your own models by installing [GPTQModel](https://github.com/ModelCloud/GPTQModel) or picking one of the [5000+ models on Huggingface](https://huggingface.co/models?sort=trending&search=gptq). ```console pip install -U gptqmodel --no-build-isolation -v ``` +## Quantizing a model + After installing GPTQModel, you are ready to quantize a model. 
Please refer to the [GPTQModel readme](https://github.com/ModelCloud/GPTQModel/?tab=readme-ov-file#quantization) for further details. Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: @@ -49,12 +53,16 @@ model.quantize(calibration_dataset, batch_size=2) model.save(quant_path) ``` +## Running a quantized model with vLLM + To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command: ```console python examples/offline_inference/llm_engine_example.py --model DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2 ``` +## Using GPTQModel with vLLM's Python API + GPTQModel quantized models are also supported directly through the LLM entrypoint: ```python @@ -67,14 +75,17 @@ prompts = [ "The capital of France is", "The future of AI is", ] + # Create a sampling params object. sampling_params = SamplingParams(temperature=0.6, top_p=0.9) # Create an LLM. llm = LLM(model="DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2") + # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) + # Print the outputs. for output in outputs: prompt = output.prompt From fc966e9cc6a6e01f642c721514de008a06a57203 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Fri, 25 Apr 2025 02:10:32 -0700 Subject: [PATCH 76/81] Only turn on FastIncrementalDetokenizer when tokenizers >= 0.21.1 (#17158) --- vllm/v1/engine/detokenizer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 330a3f6dad90..dca327cc5d07 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -2,6 +2,8 @@ from abc import ABC, abstractmethod from typing import Optional +import tokenizers +from packaging import version from tokenizers import Tokenizer from tokenizers.decoders import DecodeStream from transformers import PreTrainedTokenizerFast @@ -43,8 +45,10 @@ def from_new_request( # No tokenizer => skipping detokenization. return IncrementalDetokenizer() - if isinstance(tokenizer, PreTrainedTokenizerFast): + if (isinstance(tokenizer, PreTrainedTokenizerFast) and version.parse( + tokenizers.__version__) >= version.parse("0.21.1")): # Fast tokenizer => use tokenizers library DecodeStream. + # And only tokenizers >= 0.21.1 supports Fast Detokenizer. return FastIncrementalDetokenizer(tokenizer, request) # Fall back to slow python-based incremental detokenization. From f851b8426659d00a264eaab5ef0f69f70e592e37 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Fri, 25 Apr 2025 18:23:57 +0800 Subject: [PATCH 77/81] [Doc] Add two links to disagg_prefill.md (#17168) Signed-off-by: windsonsea --- docs/source/features/disagg_prefill.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md index 52d253b9c2b1..2fa20140c086 100644 --- a/docs/source/features/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -21,11 +21,11 @@ Disaggregated prefill DOES NOT improve throughput. ## Usage example -Please refer to `examples/online_serving/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. +Please refer to for the example usage of disaggregated prefilling. 
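From the client's point of view, a disaggregated deployment looks like a single OpenAI-compatible server fronted by the proxy that the referenced script starts. As a minimal sketch (the proxy port and model name below are assumptions based on typical runs of that script, not guaranteed by this doc), a request can be sent like this:

```python
# Hedged sketch: assumes the prefill/decode instances and the proxy from the
# referenced script are already running, and that the proxy exposes the usual
# OpenAI-compatible API on localhost:8000 (an assumption - check the script).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    # Assumption: whichever model the script launches; replace as needed.
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    prompt="San Francisco is a",
    max_tokens=32,
)
print(completion.choices[0].text)
```

The point of the sketch is that disaggregated prefilling is transparent to clients; only the server-side topology changes.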
## Benchmarks -Please refer to `benchmarks/disagg_benchmarks/` for disaggregated prefilling benchmarks. +Please refer to for disaggregated prefilling benchmarks. ## Development From 7feae92c1fa44a979e0d449927a0282e73030f19 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Fri, 25 Apr 2025 05:44:58 -0600 Subject: [PATCH 78/81] [Doc] Move todo out of beam search docstring (#17183) Signed-off-by: Alex-Brooks --- vllm/entrypoints/llm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 38a541a408fa..5fcf88f57a13 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -519,11 +519,9 @@ def beam_search( prompts: A list of prompts. Each prompt can be a string or a list of token IDs. params: The beam search parameters. - - TODO: how does beam search work together with length penalty, frequency - penalty, and stopping criteria, etc.? """ - + # TODO: how does beam search work together with length penalty, + # frequency, penalty, and stopping criteria, etc.? beam_width = params.beam_width max_tokens = params.max_tokens temperature = params.temperature From 19dcc02a72e3ed52e3bf95aae44ea1f40ce42ea0 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 25 Apr 2025 21:03:34 +0800 Subject: [PATCH 79/81] [Bugfix] Fix mistral model tests (#17181) Signed-off-by: DarkLight1337 --- .../decoder_only/language/test_mistral.py | 64 +++++++++++-------- .../tool_parsers/mistral_tool_parser.py | 4 ++ 2 files changed, 40 insertions(+), 28 deletions(-) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index ec885386dd94..79778072cc8b 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -10,8 +10,8 @@ import jsonschema.exceptions import pytest -from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa - MistralToolParser) +from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( + MistralToolCall, MistralToolParser) from vllm.sampling_params import GuidedDecodingParams, SamplingParams from ...utils import check_logprobs_close @@ -194,7 +194,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, ) -@pytest.mark.skip("RE-ENABLE: test is currently failing on main.") @pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @@ -246,10 +245,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str, assert "๏ฟฝ" not in outputs[0].outputs[0].text.strip() -@pytest.mark.skip("RE-ENABLE: test is currently failing on main.") +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("model", - MISTRAL_FORMAT_MODELS) # v1 can't do func calling def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: with vllm_runner(model, dtype=dtype, @@ -270,7 +267,8 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: parsed_message = tool_parser.extract_tool_calls(model_output, None) assert parsed_message.tools_called - assert parsed_message.tool_calls[0].id == "0UAqFzWsD" + + assert MistralToolCall.is_valid_id(parsed_message.tool_calls[0].id) assert parsed_message.tool_calls[ 0].function.name == "get_current_weather" assert parsed_message.tool_calls[ @@ -281,28 +279,38 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> 
None: @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("guided_backend", ["outlines", "lm-format-enforcer", "xgrammar"]) -def test_mistral_guided_decoding(vllm_runner, model: str, - guided_backend: str) -> None: - with vllm_runner(model, dtype='bfloat16', - tokenizer_mode="mistral") as vllm_model: +def test_mistral_guided_decoding( + monkeypatch: pytest.MonkeyPatch, + vllm_runner, + model: str, + guided_backend: str, +) -> None: + with monkeypatch.context() as m: + # Guided JSON not supported in xgrammar + V1 yet + m.setenv("VLLM_USE_V1", "0") - guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA, - backend=guided_backend) - params = SamplingParams(max_tokens=512, - temperature=0.7, - guided_decoding=guided_decoding) - - messages = [{ - "role": "system", - "content": "you are a helpful assistant" - }, { - "role": - "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {SAMPLE_JSON_SCHEMA}" - }] - outputs = vllm_model.model.chat(messages, sampling_params=params) + with vllm_runner( + model, + dtype='bfloat16', + tokenizer_mode="mistral", + guided_decoding_backend=guided_backend, + ) as vllm_model: + guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA) + params = SamplingParams(max_tokens=512, + temperature=0.7, + guided_decoding=guided_decoding) + + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + f"Give an example JSON for an employee profile that " + f"fits this schema: {SAMPLE_JSON_SCHEMA}" + }] + outputs = vllm_model.model.chat(messages, sampling_params=params) generated_text = outputs[0].outputs[0].text json_response = json.loads(generated_text) diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index f0000daa0a41..9dbfe85ecc68 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -38,6 +38,10 @@ def generate_random_id(): # https://github.com/mistralai/mistral-common/blob/21ee9f6cee3441e9bb1e6ed2d10173f90bd9b94b/src/mistral_common/protocol/instruct/validator.py#L299 return "".join(choices(ALPHANUMERIC, k=9)) + @staticmethod + def is_valid_id(id: str) -> bool: + return id.isalnum() and len(id) == 9 + @ToolParserManager.register_module("mistral") class MistralToolParser(ToolParser): From d5615af9aee97ef44f46de722d48852eb5d40802 Mon Sep 17 00:00:00 2001 From: Jasmond L <120363110+JasmondL@users.noreply.github.com> Date: Fri, 25 Apr 2025 22:26:30 +0800 Subject: [PATCH 80/81] [Bugfix] Fix Mistral ChatCompletionRequest Body Exception (#16769) Signed-off-by: Jasmond Loh Co-authored-by: Cyrus Leung --- vllm/entrypoints/chat_utils.py | 40 +++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index bd2c3357cdc0..4901d8f98dac 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1198,14 +1198,25 @@ def apply_hf_chat_template( "allowed, so you must provide a chat template if the tokenizer " "does not define one.") - return tokenizer.apply_chat_template( - conversation=conversation, # type: ignore[arg-type] - tools=tools, # type: ignore[arg-type] - chat_template=hf_chat_template, - tokenize=tokenize, - **kwargs, - ) + try: + + return tokenizer.apply_chat_template( + conversation=conversation, # type: ignore[arg-type] + tools=tools, # 
type: ignore[arg-type] + chat_template=hf_chat_template, + tokenize=tokenize, + **kwargs, + ) + # External library exceptions can sometimes occur despite the framework's + # internal exception management capabilities. + except Exception as e: + + # Log and report any library-related exceptions for further + # investigation. + logger.exception( + "An error occurred in `transformers` while applying chat template") + raise ValueError from e def apply_mistral_chat_template( tokenizer: MistralTokenizer, @@ -1214,6 +1225,8 @@ def apply_mistral_chat_template( tools: Optional[list[dict[str, Any]]], **kwargs: Any, ) -> list[int]: + from mistral_common.exceptions import MistralCommonException + # The return value of resolve_mistral_chat_template is always None, # and we won't use it. resolve_mistral_chat_template( @@ -1231,5 +1244,16 @@ def apply_mistral_chat_template( # if input does not comply with the expected format. # We convert those assertion errors to ValueErrors so they can be # are properly caught in the preprocessing_input step - except AssertionError as e: + except (AssertionError, MistralCommonException) as e: + raise ValueError from e + + # External library exceptions can sometimes occur despite the framework's + # internal exception management capabilities. + except Exception as e: + + # Log and report any library-related exceptions for further + # investigation. + logger.exception( + "An error occurred in `mistral_common` while applying chat " + "template") raise ValueError from e From a9e7a00feb7943cde94a0e569c716f402445ba1d Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 25 Apr 2025 18:57:10 +0000 Subject: [PATCH 81/81] Fix API typo and remove FP8 on V1 restriction --- vllm/attention/ops/prefix_prefill.py | 2 +- vllm/engine/arg_utils.py | 17 ----------------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 1da1bc1fa4d5..e8e3e1f521f6 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -38,11 +38,11 @@ def _fwd_kernel(Q, V, K_cache, V_cache, - out_scale, B_Loc, sm_scale, k_scale, v_scale, + out_scale, B_Start_Loc, B_Seqlen, x: tl.constexpr, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 326bea0cb6a3..68bea93b3354 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1368,23 +1368,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: recommend_to_remove=False) return False - if current_platform.is_rocm(): - from vllm.model_executor.layers.quantization.fp8 import Fp8Config - load_config = self.create_load_config() - quantization_config = VllmConfig.get_quantization_config( - model_config, load_config) - if isinstance(quantization_config, Fp8Config): - _raise_or_fallback(feature_name="fp8 for ROCm", - recommend_to_remove=False) - return False - from vllm.model_executor.layers.quantization.quark.quark import ( - QuarkConfig) - - if isinstance(quantization_config, QuarkConfig - ) and quantization_config.has_fp8_layer_weights(): - _raise_or_fallback(feature_name="Quark fp8 for ROCm", - recommend_to_remove=False) - # No Fp8 KV cache so far. if self.kv_cache_dtype != "auto": fp8_attention = self.kv_cache_dtype.startswith("fp8")
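
Taken together, the fp8-attention patches in this series thread four per-tensor scales (q, k, v, and the softmax prob) from a quantized checkpoint into the attention layer, and the final patch reorders `out_scale` in the Triton kernel signature and lifts the ROCm fp8 restriction on V1. As a self-contained illustration of the checkpoint-name remapping half of that plumbing, here is a sketch of the rule added to `Fp8Config.get_cache_scale` and `QuarkConfig.get_cache_scale` (an illustration, not the vLLM module itself):

```python
# Minimal sketch of the output_scale -> attn scale name mapping added in this
# series; the example parameter name below is illustrative, not taken from a
# real checkpoint.
from typing import Optional


def get_cache_scale(name: str) -> Optional[str]:
    """Map a checkpoint's *.output_scale parameters onto the attention
    module's k/v/q/prob scale parameters."""
    if name.endswith(".output_scale") and ".k_proj" in name:
        return name.replace(".k_proj.output_scale", ".attn.k_scale")
    if name.endswith(".output_scale") and ".v_proj" in name:
        return name.replace(".v_proj.output_scale", ".attn.v_scale")
    if name.endswith(".output_scale") and ".q_proj" in name:
        return name.replace(".q_proj.output_scale", ".attn.q_scale")
    if name.endswith("self_attn.prob_output_scale"):
        return name.replace(".prob_output_scale", ".attn.prob_scale")
    return None  # anything else keeps its original name


# A q_proj output scale from the checkpoint lands on attn.q_scale:
assert get_cache_scale(
    "model.layers.0.self_attn.q_proj.output_scale"
) == "model.layers.0.self_attn.attn.q_scale"
```

Once remapped, the loader fills `layer._q_scale` / `layer._prob_scale`, falling back to 1.0 and warning when the checkpoint provides no scale, which is the behavior added in `kv_cache.py` above.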