From 69068cdd5b420ffb7e3926a6a4c6575efeea1b59 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Tue, 12 Aug 2025 20:43:22 -0400 Subject: [PATCH 01/43] chore: finalize cleanup from v0 Signed-off-by: Aaron Pham --- .../benchmark_serving_structured_output.py | 16 +- docs/features/reasoning_outputs.md | 12 +- docs/features/structured_outputs.md | 36 ++-- docs/features/tool_calling.md | 11 +- docs/serving/openai_compatible_server.md | 4 +- .../offline_inference/structured_outputs.py | 57 +++--- ...t_completion_client_with_tools_required.py | 2 +- .../structured_outputs/structured_outputs.py | 8 +- tests/async_engine/test_async_llm_engine.py | 1 - tests/entrypoints/conftest.py | 2 +- tests/entrypoints/llm/test_lazy_outlines.py | 82 -------- tests/entrypoints/openai/test_chat.py | 18 +- tests/entrypoints/openai/test_completion.py | 78 ++++--- tests/entrypoints/openai/test_serving_chat.py | 4 - tests/test_sampling_params.py | 84 -------- tests/tool_use/test_tool_choice_required.py | 11 +- tests/v1/core/test_scheduler.py | 6 +- tests/v1/engine/test_llm_engine.py | 4 +- tests/v1/entrypoints/conftest.py | 2 +- .../llm/test_struct_output_generate.py | 92 +++++---- vllm/config/__init__.py | 35 ++-- vllm/engine/arg_utils.py | 64 ++---- vllm/engine/async_llm_engine.py | 8 +- vllm/engine/llm_engine.py | 11 +- vllm/engine/multiprocessing/client.py | 6 +- vllm/engine/protocol.py | 7 +- vllm/entrypoints/llm.py | 19 +- vllm/entrypoints/openai/protocol.py | 192 +++++++----------- vllm/model_executor/models/config.py | 6 +- vllm/sampling_params.py | 28 +-- vllm/transformers_utils/tokenizers/mistral.py | 3 - vllm/v1/engine/async_llm.py | 3 - vllm/v1/engine/processor.py | 39 +--- vllm/v1/request.py | 6 +- vllm/v1/structured_output/__init__.py | 6 +- vllm/v1/structured_output/backend_guidance.py | 4 +- vllm/v1/structured_output/backend_outlines.py | 12 +- vllm/v1/structured_output/backend_xgrammar.py | 6 +- vllm/v1/structured_output/request.py | 2 +- 39 files changed, 357 insertions(+), 630 deletions(-) delete mode 100644 tests/entrypoints/llm/test_lazy_outlines.py delete mode 100644 tests/test_sampling_params.py diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index ca6843a72aa3..28821aa4ab73 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -696,11 +696,11 @@ def _eval_correctness_regex(expected, actual): return re.match(args.regex, actual) is not None def _eval_correctness(expected, actual): - if args.structure_type == "guided_json": + if args.structure_type == "json": return _eval_correctness_json(expected, actual) - elif args.structure_type == "guided_regex": + elif args.structure_type == "regex": return _eval_correctness_regex(expected, actual) - elif args.structure_type == "guided_choice": + elif args.structure_type == "choice": return _eval_correctness_choice(expected, actual) else: return None @@ -780,18 +780,18 @@ def main(args: argparse.Namespace): ) if args.dataset == "grammar": - args.structure_type = "guided_grammar" + args.structure_type = "grammar" elif args.dataset == "regex": - args.structure_type = "guided_regex" + args.structure_type = "regex" elif args.dataset == "choice": - args.structure_type = "guided_choice" + args.structure_type = "choice" else: - args.structure_type = "guided_json" + args.structure_type = "json" if args.no_structured_output: args.structured_output_ratio = 0 if args.save_results: - result_file_name = 
f"{args.structured_output_ratio}guided" + result_file_name = f"{args.structured_output_ratio}so" result_file_name += f"_{backend}" result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.model.split('/')[-1]}" diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 04b943efbbbb..3c66f4bd57df 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -1,3 +1,7 @@ +--- +title: reasoning_outputs +--- + # Reasoning Outputs vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions. @@ -10,11 +14,11 @@ vLLM currently supports the following reasoning models: | Model Series | Parser Name | Structured Output Support | Tool Calling | |--------------|-------------|------------------|-------------| -| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ | -| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ | +| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ | +| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ | | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | -| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ | -| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ | +| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ | +| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ | !!! note IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 8a934d406f38..c99a54197421 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla The following parameters are supported, which must be added as extra parameters: -- `guided_choice`: the output will be exactly one of the choices. -- `guided_regex`: the output will follow the regex pattern. -- `guided_json`: the output will follow the JSON schema. -- `guided_grammar`: the output will follow the context free grammar. +- `choice`: the output will be exactly one of the choices. +- `regex`: the output will follow the regex pattern. +- `json`: the output will follow the JSON schema. +- `grammar`: the output will follow the context free grammar. - `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text. You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page. Structured outputs are supported by default in the OpenAI-Compatible Server. 
You may choose to specify the backend to use by setting the -`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`, +`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`, which will try to choose an appropriate backend based on the details of the request. You may also choose a specific backend, along with some options. A full set of options is available in the `vllm serve --help` text. -Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: +Now let´s see an example for each of the cases, starting with the `choice`, as it´s the easiest one: ??? code @@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic messages=[ {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} ], - extra_body={"guided_choice": ["positive", "negative"]}, + extra_body={"structured_outputs": {"choices": ["positive", "negative"]}}, ) print(completion.choices[0].message.content) ``` -The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template: +The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template: ??? code @@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", } ], - extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, + extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]}, ) print(completion.choices[0].message.content) ``` One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. -For this we can use the `guided_json` parameter in two different ways: +For this we can use the `json` parameter in two different ways: - Using directly a [JSON Schema](https://json-schema.org/) - Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option). -The next example shows how to use the `guided_json` parameter with a Pydantic model: +The next example shows how to use the `response_format` parameter with a Pydantic model: ??? code @@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo JSON schema and how the fields should be populated. This can improve the results notably in most cases. -Finally we have the `guided_grammar` option, which is probably the most +Finally we have the `grammar` option, which is probably the most difficult to use, but it´s really powerful. It allows us to define complete languages like SQL queries. It works by using a context free EBNF grammar. 
As an example, we can use to define a specific format of simplified SQL queries: @@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries: "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", } ], - extra_body={"guided_grammar": simplified_sql_grammar}, + extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}}, ) print(completion.choices[0].message.content) ``` @@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: int: top_p=0.95, n=n, seed=seed, - guided_decoding=GuidedDecodingParams( + structured_outputs=StructuredOutputsParams( regex="[0-9]+") if structured_outputs else None, ) for n in n_list ], n_list diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py index ffe061212466..08d50e3fc928 100644 --- a/tests/v1/entrypoints/conftest.py +++ b/tests/v1/entrypoints/conftest.py @@ -151,7 +151,7 @@ def sample_definition_json_schema(): @pytest.fixture -def sample_guided_choice(): +def sample_choices(): return [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", "Swift", "Kotlin" diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 8bddfb0b48a5..3e6fdc6ee3e5 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -18,7 +18,7 @@ from vllm.outputs import RequestOutput from vllm.platforms import current_platform from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager -from vllm.sampling_params import GuidedDecodingParams, SamplingParams +from vllm.sampling_params import SamplingParams, StructuredOutputsParams if TYPE_CHECKING: from vllm.config import TokenizerMode @@ -85,7 +85,7 @@ def _load_json(s: str, backend: str) -> str: @pytest.mark.skip_global_cleanup @pytest.mark.parametrize( - "model_name, guided_decoding_backend, tokenizer_mode, speculative_config", + "model_name, backend, tokenizer_mode, speculative_config", PARAMS_MODELS_BACKENDS_TOKENIZER_MODE) def test_structured_output( monkeypatch: pytest.MonkeyPatch, @@ -94,8 +94,8 @@ def test_structured_output( sample_sql_ebnf: str, sample_sql_lark: str, sample_regex: str, - sample_guided_choice: str, - guided_decoding_backend: str, + sample_choices: str, + backend: str, tokenizer_mode: str, model_name: str, speculative_config: dict[str, Any], @@ -110,15 +110,13 @@ def test_structured_output( enforce_eager = bool(not current_platform.is_tpu()) # Use a single LLM instance for several scenarios to # speed up the test suite. 
- llm = LLM( - model=model_name, - enforce_eager=enforce_eager, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend, - guided_decoding_disable_any_whitespace=(guided_decoding_backend - in {"xgrammar", "guidance"}), - tokenizer_mode=tokenizer_mode, - speculative_config=speculative_config) + llm = LLM(model=model_name, + enforce_eager=enforce_eager, + max_model_len=1024, + structured_outputs_config=dict( + disable_any_whitespace=backend in {"xgrammar", "guidance"}), + tokenizer_mode=tokenizer_mode, + speculative_config=speculative_config) # # Test 1: Generate JSON output based on a provided schema @@ -126,7 +124,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=sample_json_schema)) + structured_outputs=StructuredOutputsParams(json=sample_json_schema)) outputs = llm.generate(prompts=[ (f"Give an example JSON for an employee profile that fits this " f"schema. Make the response as short as possible. Schema: " @@ -152,12 +150,12 @@ def test_structured_output( # # Test 2: Generate JSON object without a schema # - if guided_decoding_backend != "outlines": + if backend != "outlines": sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, n=2, - guided_decoding=GuidedDecodingParams(json_object=True)) + structured_outputs=StructuredOutputsParams(json_object=True)) outputs = llm.generate(prompts=( "Generate a JSON object with curly braces for a person with " @@ -186,8 +184,9 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=unsupported_json_schema)) - if guided_decoding_backend.startswith("xgrammar"): + structured_outputs=StructuredOutputsParams( + json=unsupported_json_schema)) + if backend.startswith("xgrammar"): with pytest.raises(ValueError, match="The provided JSON schema contains features " "not supported by xgrammar."): @@ -217,7 +216,7 @@ def test_structured_output( parsed_json = json.loads(generated_text) assert isinstance(parsed_json, dict) - if guided_decoding_backend != "outlines": + if backend != "outlines": # # Test 4: Generate SQL statement using EBNF grammar # @@ -225,7 +224,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf)) + structured_outputs=StructuredOutputsParams( + grammar=sample_sql_ebnf)) outputs = llm.generate( prompts=( "Generate a sql statement that selects col_1 from " @@ -259,7 +259,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark)) + structured_outputs=StructuredOutputsParams( + grammar=sample_sql_lark)) outputs = llm.generate( prompts=( "Generate a sql statement that selects col_1 from " @@ -298,7 +299,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar="not a grammar")) + structured_outputs=StructuredOutputsParams( + grammar="not a grammar")) with pytest.raises(ValueError, match="Failed to convert the grammar "): llm.generate( prompts= @@ -315,7 +317,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.8, top_p=0.95, - guided_decoding=GuidedDecodingParams(regex=sample_regex)) + structured_outputs=StructuredOutputsParams(regex=sample_regex)) outputs = llm.generate( prompts=[ (f"Give an example IPv4 address with this regex: {sample_regex}. 
" @@ -342,7 +344,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.8, top_p=0.95, - guided_decoding=GuidedDecodingParams(choice=sample_guided_choice)) + structured_outputs=StructuredOutputsParams(choice=sample_choices)) outputs = llm.generate( prompts=("The best language for type-safe systems programming is " "(Make the response as short as possible.) "), @@ -356,7 +358,7 @@ def test_structured_output( generated_text = output.outputs[0].text print(generated_text) assert generated_text is not None - assert generated_text in sample_guided_choice + assert generated_text in sample_choices print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") # @@ -366,7 +368,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, - guided_decoding=GuidedDecodingParams(json=json_schema)) + structured_outputs=StructuredOutputsParams(json=json_schema)) outputs = llm.generate(prompts=( "Generate a JSON with the brand, model and car_type of the most " "iconic car from the 90's. Make the response as short as " @@ -408,7 +410,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=json_schema)) + structured_outputs=StructuredOutputsParams(json=json_schema)) outputs = llm.generate( prompts=("Generate a description of a frog using 50 characters. " @@ -429,7 +431,7 @@ def test_structured_output( output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=json_schema) - if guided_decoding_backend != "outlines": + if backend != "outlines": # # Test 11: Generate structured output using structural_tag format # @@ -455,7 +457,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams( + structured_outputs=StructuredOutputsParams( structural_tag=json.dumps(structural_tag_config))) prompt = """ @@ -532,7 +534,7 @@ def test_structured_output( @pytest.mark.skip_global_cleanup @pytest.mark.parametrize( - "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 + "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 [ ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto", "deepseek_r1", NGRAM_SPEC_CONFIG), @@ -541,7 +543,7 @@ def test_structured_output( ) def test_structured_output_with_reasoning_matrices( monkeypatch: pytest.MonkeyPatch, - guided_decoding_backend: str, + backend: str, tokenizer_mode: TokenizerMode, reasoning_parser: str, model_name: str, @@ -561,10 +563,10 @@ def test_structured_output_with_reasoning_matrices( enforce_eager=bool(not current_platform.is_tpu()), max_model_len=1024, max_num_seqs=16, - guided_decoding_backend=guided_decoding_backend, - guided_decoding_disable_any_whitespace=True, + backend=backend, + structured_outputs_config=dict(disable_any_whitespace=True, + reasoning_backend=reasoning_parser), tokenizer_mode=tokenizer_mode, - reasoning_parser=reasoning_parser, speculative_config=speculative_config, ) tokenizer = llm.get_tokenizer(None) @@ -588,7 +590,7 @@ def test_structured_output_with_reasoning_matrices( sampling_params = SamplingParams( temperature=0.1, max_tokens=8192, - guided_decoding=GuidedDecodingParams(json=reasoning_schema), + structured_outputs=StructuredOutputsParams(json=reasoning_schema), ) outputs = llm.generate( [reasoning_prompt], @@ -625,13 +627,14 @@ def test_structured_output_auto_mode( llm = 
LLM(model=model_name, max_model_len=1024, - guided_decoding_backend="auto", + backend="auto", tokenizer_mode=tokenizer_mode) sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, - guided_decoding=GuidedDecodingParams(json=unsupported_json_schema)) + structured_outputs=StructuredOutputsParams( + json=unsupported_json_schema)) prompts = ( "Give an example JSON object for a grade " @@ -668,9 +671,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", max_model_len=1024, - guided_decoding_backend="guidance", - guided_decoding_disable_any_whitespace=True, - guided_decoding_disable_additional_properties=True) + structured_outputs_config=dict( + disable_any_whitespace=True, + disable_additional_properties=True)) schema = { 'type': 'object', @@ -696,14 +699,15 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): "<|im_end|>\n<|im_start|>assistant\n") def generate_with_backend(backend): - guided_params = GuidedDecodingParams( + structured_outputs_params = StructuredOutputsParams( json=schema, backend=backend, disable_any_whitespace=True, disable_additional_properties=True) - sampling_params = SamplingParams(temperature=0, - max_tokens=256, - guided_decoding=guided_params) + sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + structured_outputs=structured_outputs_params) outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) assert outputs is not None diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index df4eb33f5d45..11c810386ee2 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2566,24 +2566,24 @@ class PoolerConfig: ## for embeddings models normalize: Optional[bool] = None """ - Whether to normalize the embeddings outputs. + Whether to normalize the embeddings outputs. """ dimensions: Optional[int] = None """ - Reduce the dimensions of embeddings if model + Reduce the dimensions of embeddings if model support matryoshka representation. """ ## for classification models activation: Optional[bool] = None """ - Whether to apply activation function to the classification outputs. + Whether to apply activation function to the classification outputs. """ ## for reward models softmax: Optional[bool] = None """ - Whether to apply softmax to the reward outputs. + Whether to apply softmax to the reward outputs. """ step_tag_id: Optional[int] = None """ @@ -2946,26 +2946,26 @@ def get_served_model_name(model: str, return served_model_name -GuidedDecodingBackend = Literal["auto", "xgrammar", "guidance", "outlines"] +StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines"] @config @dataclass -class DecodingConfig: - """Dataclass which contains the decoding strategy of the engine.""" +class StructuredOutputsConfig: + """Dataclass which contains structured outputs config for the engine.""" - backend: GuidedDecodingBackend = "auto" - """Which engine will be used for guided decoding (JSON schema / regex etc) + backend: StructuredOutputsBackend = "auto" + """Which engine will be used for structured outputs (JSON schema / regex etc) by default. 
With "auto", we will make opinionated choices based on request contents and what the backend libraries currently support, so the behavior - is subject to change in each release.""" + is subject to change in each release.""" # noqa: E501 disable_fallback: bool = False """If `True`, vLLM will not fallback to a different backend on error.""" disable_any_whitespace: bool = False - """If `True`, the model will not generate any whitespace during guided - decoding. This is only supported for xgrammar and guidance backends.""" + """If `True`, the model will not generate any whitespace during structured + outputs. This is only supported for xgrammar and guidance backends.""" disable_additional_properties: bool = False """If `True`, the `guidance` backend will not use `additionalProperties` @@ -3262,8 +3262,9 @@ class VllmConfig: """LoRA configuration.""" speculative_config: Optional[SpeculativeConfig] = None """Speculative decoding configuration.""" - decoding_config: DecodingConfig = field(default_factory=DecodingConfig) - """Decoding configuration.""" + structured_outputs_config: StructuredOutputsConfig = field( + default_factory=StructuredOutputsConfig) + """Structured outputs configuration.""" observability_config: Optional[ObservabilityConfig] = None """Observability configuration.""" quant_config: Optional[QuantizationConfig] = None @@ -3354,8 +3355,8 @@ def compute_hash(self) -> str: vllm_factors.append(self.speculative_config.compute_hash()) else: vllm_factors.append("None") - if self.decoding_config: - vllm_factors.append(self.decoding_config.compute_hash()) + if self.structured_outputs_config: + vllm_factors.append(self.structured_outputs_config.compute_hash()) else: vllm_factors.append("None") if self.observability_config: @@ -3775,7 +3776,7 @@ def __str__(self): f"enforce_eager={self.model_config.enforce_eager}, " f"kv_cache_dtype={self.cache_config.cache_dtype}, " f"device_config={self.device_config.device}, " - f"decoding_config={self.decoding_config!r}, " + f"decoding_config={self.structured_outputs_config!r}, " f"observability_config={self.observability_config!r}, " f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d74db67bda0d..32911d29f55c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -23,21 +23,20 @@ import vllm.envs as envs from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, ConfigFormat, ConfigType, ConvertOption, - DecodingConfig, DetailedTraceModules, Device, - DeviceConfig, DistributedExecutorBackend, - GuidedDecodingBackend, HfOverrides, KVEventsConfig, - KVTransferConfig, LoadConfig, LogprobsMode, - LoRAConfig, ModelConfig, ModelDType, ModelImpl, - MultiModalConfig, ObservabilityConfig, ParallelConfig, - PoolerConfig, PrefixCachingHashAlgo, RunnerOption, - SchedulerConfig, SchedulerPolicy, SpeculativeConfig, + DetailedTraceModules, Device, DeviceConfig, + DistributedExecutorBackend, HfOverrides, + KVEventsConfig, KVTransferConfig, LoadConfig, + LogprobsMode, LoRAConfig, ModelConfig, ModelDType, + ModelImpl, MultiModalConfig, ObservabilityConfig, + ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, + RunnerOption, SchedulerConfig, SchedulerPolicy, + SpeculativeConfig, StructuredOutputsConfig, TaskOption, TokenizerMode, VllmConfig, get_attr_docs, get_field) from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins from 
vllm.ray.lazy_utils import is_ray_initialized -from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.config import is_interleaved from vllm.transformers_utils.utils import check_gguf_file @@ -382,12 +381,9 @@ class EngineArgs: disable_hybrid_kv_cache_manager: bool = ( SchedulerConfig.disable_hybrid_kv_cache_manager) - guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend - guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback - guided_decoding_disable_any_whitespace: bool = \ - DecodingConfig.disable_any_whitespace - guided_decoding_disable_additional_properties: bool = \ - DecodingConfig.disable_additional_properties + structured_outputs_config: StructuredOutputsConfig = get_field( + VllmConfig, "structured_outputs_config") + logits_processor_pattern: Optional[ str] = ModelConfig.logits_processor_pattern @@ -426,7 +422,6 @@ class EngineArgs: additional_config: dict[str, Any] = \ get_field(VllmConfig, "additional_config") - reasoning_parser: str = DecodingConfig.reasoning_backend use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location @@ -567,29 +562,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: load_group.add_argument('--pt-load-map-location', **load_kwargs["pt_load_map_location"]) - # Guided decoding arguments - guided_decoding_kwargs = get_kwargs(DecodingConfig) - guided_decoding_group = parser.add_argument_group( - title="DecodingConfig", - description=DecodingConfig.__doc__, - ) - guided_decoding_group.add_argument("--guided-decoding-backend", - **guided_decoding_kwargs["backend"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-fallback", - **guided_decoding_kwargs["disable_fallback"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-any-whitespace", - **guided_decoding_kwargs["disable_any_whitespace"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-additional-properties", - **guided_decoding_kwargs["disable_additional_properties"]) - guided_decoding_group.add_argument( - "--reasoning-parser", - # This choices is a special case because it's not static - choices=list(ReasoningParserManager.reasoning_parsers), - **guided_decoding_kwargs["reasoning_backend"]) - # Parallel arguments parallel_kwargs = get_kwargs(ParallelConfig) parallel_group = parser.add_argument_group( @@ -840,6 +812,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **vllm_kwargs["compilation_config"]) vllm_group.add_argument("--additional-config", **vllm_kwargs["additional_config"]) + vllm_group.add_argument('--structured-outputs-config', + **vllm_kwargs["structured_outputs_config"]) # Other arguments parser.add_argument('--disable-log-stats', @@ -1328,14 +1302,8 @@ def create_engine_config( load_config = self.create_load_config() - decoding_config = DecodingConfig( - backend=self.guided_decoding_backend, - disable_fallback=self.guided_decoding_disable_fallback, - disable_any_whitespace=self.guided_decoding_disable_any_whitespace, - disable_additional_properties=\ - self.guided_decoding_disable_additional_properties, - reasoning_backend=self.reasoning_parser - ) + structured_outputs_config = StructuredOutputsConfig( + **self.structured_outputs) observability_config = ObservabilityConfig( show_hidden_metrics_for_version=( @@ -1353,7 +1321,7 @@ def create_engine_config( lora_config=lora_config, 
speculative_config=speculative_config, load_config=load_config, - decoding_config=decoding_config, + structured_outputs_config=structured_outputs_config, observability_config=observability_config, compilation_config=self.compilation_config, kv_transfer_config=self.kv_transfer_config, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1f962b008ee0..851962920abe 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -10,8 +10,8 @@ from weakref import ReferenceType import vllm.envs as envs -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VllmConfig) +from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, + SchedulerConfig, VllmConfig) from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout @@ -1063,10 +1063,6 @@ async def get_parallel_config(self) -> ParallelConfig: """Get the parallel configuration of the vLLM engine.""" return self.engine.get_parallel_config() - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - return self.engine.get_decoding_config() - async def get_scheduler_config(self) -> SchedulerConfig: """Get the scheduling configuration of the vLLM engine.""" return self.engine.get_scheduler_config() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3fc4f6445df2..f04ec035030d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -16,9 +16,8 @@ from typing_extensions import TypeVar import vllm.envs as envs -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, SchedulerConfig, - VllmConfig) +from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig, + ParallelConfig, SchedulerConfig, VllmConfig) from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase, Stats @@ -217,8 +216,6 @@ def __init__( self.device_config = vllm_config.device_config self.speculative_config = vllm_config.speculative_config # noqa self.load_config = vllm_config.load_config - self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa - ) self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa ) @@ -814,10 +811,6 @@ def get_parallel_config(self) -> ParallelConfig: """Gets the parallel configuration.""" return self.parallel_config - def get_decoding_config(self) -> DecodingConfig: - """Gets the decoding configuration.""" - return self.decoding_config - def get_scheduler_config(self) -> SchedulerConfig: """Gets the scheduler configuration.""" return self.scheduler_config diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index f69f72edf6a5..7c3679507686 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -16,7 +16,7 @@ from zmq.asyncio import Socket from vllm import PoolingParams -from vllm.config import DecodingConfig, ModelConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs # yapf conflicts with isort for this block # yapf: disable @@ -93,7 +93,6 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig, # Get the configs. 
self.vllm_config = engine_config self.model_config = engine_config.model_config - self.decoding_config = engine_config.decoding_config if self.vllm_config.model_config.skip_tokenizer_init: self.tokenizer = None @@ -386,9 +385,6 @@ async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): async def get_vllm_config(self) -> VllmConfig: return self.vllm_config - async def get_decoding_config(self) -> DecodingConfig: - return self.decoding_config - async def get_model_config(self) -> ModelConfig: return self.model_config diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 671e9648a3d0..5984244dd9c0 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -6,7 +6,7 @@ from typing import AsyncGenerator, Mapping, Optional from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function -from vllm.config import DecodingConfig, ModelConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.parse import is_explicit_encoder_decoder_prompt @@ -247,11 +247,6 @@ async def get_model_config(self) -> ModelConfig: """Get the model configuration of the vLLM engine.""" ... - @abstractmethod - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - ... - @abstractmethod async def get_input_preprocessor(self) -> InputPreprocessor: """Get the input processor of the vLLM engine.""" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 915f14a29b90..a942532200c6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -17,8 +17,8 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, create_sort_beams_key_function) -from vllm.config import (CompilationConfig, ModelDType, TokenizerMode, - is_init_field) +from vllm.config import (CompilationConfig, ModelDType, + StructuredOutputsConfig, TokenizerMode, is_init_field) from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides, PoolerConfig, RunnerOption) from vllm.engine.llm_engine import LLMEngine @@ -196,6 +196,8 @@ def __init__( hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, override_pooler_config: Optional[PoolerConfig] = None, + structured_outputs_config: Optional[Union[dict[ + str, Any], StructuredOutputsConfig]] = None, compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None, **kwargs, @@ -245,6 +247,18 @@ def __init__( else: compilation_config_instance = CompilationConfig() + if structured_outputs_config is not None: + if isinstance(structured_outputs_config, dict): + predicate = lambda x: is_init_field(StructuredOutputsConfig, x[ + 0]) + structured_outputs_instance = StructuredOutputsConfig(**dict( + filter( + predicate, + structured_outputs_config.items(), + ))) + else: + structured_outputs_instance = structured_outputs_config + engine_args = EngineArgs( model=model, runner=runner, @@ -271,6 +285,7 @@ def __init__( hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, override_pooler_config=override_pooler_config, + structured_outputs_config=structured_outputs_instance, compilation_config=compilation_config_instance, **kwargs, ) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 543701ed144e..abae05386d8a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ 
-35,8 +35,8 @@ ScoreMultiModalParam) from vllm.logger import init_logger from vllm.pooling_params import PoolingParams -from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, - RequestOutputKind, SamplingParams) +from vllm.sampling_params import (BeamSearchParams, RequestOutputKind, + SamplingParams, StructuredOutputsParams) from vllm.sequence import Logprob from vllm.utils import random_uuid, resolve_obj_by_qualname @@ -335,11 +335,11 @@ def to_sampling_params( stop_token_ids = default_sampling_params.get("stop_token_ids") # Structured output - guided_decoding = None + structured_outputs = None if self.text is not None and self.text.format is not None: response_format = self.text.format if response_format.type == "json_schema": - guided_decoding = GuidedDecodingParams.from_optional( + structured_outputs = StructuredOutputsParams.from_optional( json=response_format.schema_) elif response_format.type == "json_object": raise NotImplementedError("json_object is not supported") @@ -353,7 +353,7 @@ def to_sampling_params( stop_token_ids=stop_token_ids, output_kind=(RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY), - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, ) @model_validator(mode="before") @@ -500,42 +500,9 @@ class ChatCompletionRequest(OpenAIBaseModel): default=None, description=("Additional kwargs to pass to the HF processor."), ) - guided_json: Optional[Union[str, dict, BaseModel]] = Field( + structured_outputs: Optional[dict[str, Any]] = Field( default=None, - description=("If specified, the output will follow the JSON schema."), - ) - guided_regex: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the regex pattern."), - ) - guided_choice: Optional[list[str]] = Field( - default=None, - description=( - "If specified, the output will be exactly one of the choices."), - ) - guided_grammar: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the context free grammar."), - ) - structural_tag: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the structural tag schema."), - ) - guided_decoding_backend: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default guided decoding backend " - "of the server for this specific request. 
If set, must be either " - "'outlines' / 'lm-format-enforcer'"), - ) - guided_whitespace_pattern: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default whitespace pattern " - "for guided json decoding."), + description="Additional kwargs for structured outputs", ) priority: int = Field( default=0, @@ -646,30 +613,29 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - guided_json_object = None + structured_outputs = StructuredOutputsParams(**self.structured_outputs) if self.response_format is not None: if self.response_format.type == "json_object": - guided_json_object = True + structured_outputs.json_object = True elif self.response_format.type == "json_schema": json_schema = self.response_format.json_schema assert json_schema is not None - self.guided_json = json_schema.json_schema + structured_outputs.json = json_schema.json_schema elif self.response_format.type == "structural_tag": structural_tag = self.response_format assert structural_tag is not None and isinstance( structural_tag, StructuralTagResponseFormat) s_tag_obj = structural_tag.model_dump(by_alias=True) - self.structural_tag = json.dumps(s_tag_obj) - - guided_decoding = GuidedDecodingParams.from_optional( - json=self._get_guided_json_from_tool() or self.guided_json, - regex=self.guided_regex, - choice=self.guided_choice, - grammar=self.guided_grammar, - json_object=guided_json_object, - backend=self.guided_decoding_backend, - whitespace_pattern=self.guided_whitespace_pattern, - structural_tag=self.structural_tag, + structured_outputs.structural_tag = json.dumps(s_tag_obj) + + structured_outputs = StructuredOutputsParams.from_optional( + json=self._get_json_schema_from_tool() or structured_outputs.json, + regex=structured_outputs.regex, + choice=structured_outputs.choice, + grammar=structured_outputs.grammar, + json_object=structured_outputs.json_object, + whitespace_pattern=structured_outputs.whitespace_pattern, + structural_tag=structured_outputs.structural_tag, ) extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} @@ -702,14 +668,14 @@ def to_sampling_params( truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, logit_bias=self.logit_bias, bad_words= self.bad_words, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, ) - def _get_guided_json_from_tool( + def _get_json_schema_from_tool( self) -> Optional[Union[str, dict, BaseModel]]: # user has chosen to not use any tool if self.tool_choice == "none" or self.tools is None: @@ -816,28 +782,37 @@ def check_logprobs(cls, data): @model_validator(mode="before") @classmethod - def check_guided_decoding_count(cls, data): + def check_structured_outputs_count(cls, data): if isinstance(data, ValueError): raise data - guide_count = sum([ - "guided_json" in data and data["guided_json"] is not None, - "guided_regex" in data and data["guided_regex"] is not None, - "guided_choice" in data and data["guided_choice"] is not None + if "structured_outputs" not in data: + return data + + structured_outputs_kwargs = data['structured_outputs'] + + count = sum([ + "json" in structured_outputs_kwargs + and structured_outputs_kwargs["json"] is not None, + "regex" in structured_outputs_kwargs + and structured_outputs_kwargs["regex"] is not None, + "choice" in structured_outputs_kwargs + and 
structured_outputs_kwargs["choice"] is not None ]) - # you can only use one kind of guided decoding - if guide_count > 1: + # you can only use one kind of constraints for structured outputs + if count > 1: raise ValueError( - "You can only use one kind of guided decoding " - "('guided_json', 'guided_regex' or 'guided_choice').") - # you can only either use guided decoding or tools, not both - if guide_count > 1 and data.get("tool_choice", "none") not in ( + "You can only use one kind of constraints for structured outputs ('json', 'regex' or 'choice')." # noqa: E501 + ) + # you can only either use structured outputs or tools, not both + if count > 1 and data.get("tool_choice", "none") not in ( "none", "auto", "required", ): raise ValueError( - "You can only either use guided decoding or tools, not both.") + "You can only either use constraints for structured outputs or tools, not both." # noqa: E501 + ) return data @model_validator(mode="before") @@ -990,37 +965,9 @@ class CompletionRequest(OpenAIBaseModel): ", {'type': 'structural_tag'}, or {'type': 'text' } is supported." ), ) - guided_json: Optional[Union[str, dict, BaseModel]] = Field( - default=None, - description="If specified, the output will follow the JSON schema.", - ) - guided_regex: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the regex pattern."), - ) - guided_choice: Optional[list[str]] = Field( - default=None, - description=( - "If specified, the output will be exactly one of the choices."), - ) - guided_grammar: Optional[str] = Field( + structured_outputs: Optional[dict[str, Any]] = Field( default=None, - description=( - "If specified, the output will follow the context free grammar."), - ) - guided_decoding_backend: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default guided decoding backend " - "of the server for this specific request. 
If set, must be one of " - "'outlines' / 'lm-format-enforcer'"), - ) - guided_whitespace_pattern: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default whitespace pattern " - "for guided json decoding."), + description="Additional kwargs for structured outputs", ) priority: int = Field( default=0, @@ -1143,19 +1090,19 @@ def to_sampling_params( echo_without_generation = self.echo and self.max_tokens == 0 - guided_json_object = None + structured_outputs_kwargs = StructuredOutputsParams( + **self.structured_outputs) if (self.response_format is not None and self.response_format.type == "json_object"): - guided_json_object = True - - guided_decoding = GuidedDecodingParams.from_optional( - json=self.guided_json, - regex=self.guided_regex, - choice=self.guided_choice, - grammar=self.guided_grammar, - json_object=guided_json_object, - backend=self.guided_decoding_backend, - whitespace_pattern=self.guided_whitespace_pattern, + structured_outputs_kwargs.json_object = True + + structured_outputs = StructuredOutputsParams.from_optional( + json=structured_outputs_kwargs.json, + regex=structured_outputs_kwargs.regex, + choice=structured_outputs_kwargs.choice, + grammar=structured_outputs_kwargs.grammar, + json_object=structured_outputs_kwargs.json_object, + whitespace_pattern=structured_outputs_kwargs.whitespace_pattern, ) extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} @@ -1188,7 +1135,7 @@ def to_sampling_params( truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, logit_bias=self.logit_bias, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, @@ -1196,16 +1143,23 @@ def to_sampling_params( @model_validator(mode="before") @classmethod - def check_guided_decoding_count(cls, data): - guide_count = sum([ - "guided_json" in data and data["guided_json"] is not None, - "guided_regex" in data and data["guided_regex"] is not None, - "guided_choice" in data and data["guided_choice"] is not None + def check_structured_outputs_count(cls, data): + if "structured_outputs" not in data: + return data + + structured_outputs_kwargs = data['structured_outputs'] + count = sum([ + "json" in structured_outputs_kwargs + and structured_outputs_kwargs["json"] is not None, + "regex" in structured_outputs_kwargs + and structured_outputs_kwargs["regex"] is not None, + "choice" in structured_outputs_kwargs + and structured_outputs_kwargs["choice"] is not None ]) - if guide_count > 1: + if count > 1: raise ValueError( - "You can only use one kind of guided decoding " - "('guided_json', 'guided_regex' or 'guided_choice').") + "You can only use one kind of constraints for structured outputs ('json', 'regex' or 'choice')." 
# noqa: E501 + ) return data @model_validator(mode="before") @@ -1991,7 +1945,7 @@ class DetokenizeResponse(OpenAIBaseModel): class TokenizerInfoResponse(OpenAIBaseModel): """ - Response containing tokenizer configuration + Response containing tokenizer configuration equivalent to tokenizer_config.json """ diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 6f21cd267b0e..4bfe9094edff 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -251,9 +251,9 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: - decoding_config = vllm_config.decoding_config - if decoding_config.reasoning_backend == "": - decoding_config.reasoning_backend = "GptOss" + structured_outputs_config = vllm_config.structured_outputs_config + if structured_outputs_config.reasoning_backend == "": + structured_outputs_config.reasoning_backend = "GptOss" # Increase the max capture size from 512 to 1024 for performance. # NOTE(woosuk): This will increase the number of CUDA graphs diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index df4cca9ba114..632bf05372a9 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -28,7 +28,7 @@ class SamplingType(IntEnum): # maybe make msgspec? @dataclass -class GuidedDecodingParams: +class StructuredOutputsParams: """One of these fields will be used to build a logit processor.""" json: Optional[Union[str, dict]] = None regex: Optional[str] = None @@ -36,8 +36,6 @@ class GuidedDecodingParams: grammar: Optional[str] = None json_object: Optional[bool] = None """These are other options that can be set""" - backend: Optional[str] = None - backend_was_auto: bool = False disable_fallback: bool = False disable_any_whitespace: bool = False disable_additional_properties: bool = False @@ -51,37 +49,35 @@ def from_optional( choice: Optional[list[str]] = None, grammar: Optional[str] = None, json_object: Optional[bool] = None, - backend: Optional[str] = None, whitespace_pattern: Optional[str] = None, structural_tag: Optional[str] = None, - ) -> Optional["GuidedDecodingParams"]: + ) -> Optional["StructuredOutputsParams"]: if all(arg is None for arg in (json, regex, choice, grammar, json_object, structural_tag)): return None # Extract json schemas from pydantic models if isinstance(json, (BaseModel, type(BaseModel))): json = json.model_json_schema() - return GuidedDecodingParams( + return StructuredOutputsParams( json=json, regex=regex, choice=choice, grammar=grammar, json_object=json_object, - backend=backend, whitespace_pattern=whitespace_pattern, structural_tag=structural_tag, ) def __post_init__(self): """Validate that some fields are mutually exclusive.""" - guide_count = sum([ + count = sum([ self.json is not None, self.regex is not None, self.choice is not None, self.grammar is not None, self.json_object is not None ]) - if guide_count > 1: + if count > 1: raise ValueError( - "You can only use one kind of guided decoding but multiple are " - f"specified: {self.__dict__}") + f"You can only use one kind of structured outputs constraint but multiple are specified: {self.__dict__}" # noqa: E501 + ) class RequestOutputKind(Enum): @@ -194,9 +190,7 @@ class SamplingParams( _all_stop_token_ids: set[int] = msgspec.field(default_factory=set) # Fields used to construct logits processors - guided_decoding: Optional[GuidedDecodingParams] = None - """If provided, the engine will construct a guided decoding logits - 
processor from these parameters.""" + structured_outputs: Optional[StructuredOutputsParams] = None logit_bias: Optional[dict[int, float]] = None """If provided, the engine will construct a logits processor that applies these logit biases.""" @@ -243,7 +237,7 @@ def from_optional( truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None, output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, - guided_decoding: Optional[GuidedDecodingParams] = None, + structured_outputs: Optional[StructuredOutputsParams] = None, logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, allowed_token_ids: Optional[list[int]] = None, extra_args: Optional[dict[str, Any]] = None, @@ -285,7 +279,7 @@ def from_optional( logits_processors=logits_processors, truncate_prompt_tokens=truncate_prompt_tokens, output_kind=output_kind, - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, logit_bias=logit_bias, allowed_token_ids=allowed_token_ids, extra_args=extra_args, @@ -552,7 +546,7 @@ def __repr__(self) -> str: "spaces_between_special_tokens=" f"{self.spaces_between_special_tokens}, " f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " - f"guided_decoding={self.guided_decoding}, " + f"structured_outputs={self.structured_outputs}, " f"extra_args={self.extra_args})") diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 6ccc636efaf1..99237fb96567 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -463,9 +463,6 @@ def _token_to_id(t: str): return decoded - # WARN: Outlines logits processors can overwrite this method. - # See: guided_decoding/outlines_logits_processors.py::_adapt_tokenizer - # for more. 
def decode(self, ids: Union[list[int], int], skip_special_tokens: bool = True) -> str: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a2706327914c..78629a13dc35 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -526,9 +526,6 @@ async def get_vllm_config(self) -> VllmConfig: async def get_model_config(self) -> ModelConfig: return self.model_config - async def get_decoding_config(self): - raise ValueError("Not Supported on V1 yet.") - async def get_input_preprocessor(self) -> InputPreprocessor: return self.processor.input_preprocessor diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index b9419142caf6..138773e41966 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -41,7 +41,7 @@ def __init__( self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config - self.decoding_config = vllm_config.decoding_config + self.structured_outputs_config = vllm_config.structured_outputs_config self.tokenizer = tokenizer self.generation_config_fields = ( @@ -154,40 +154,23 @@ def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: "not enabled!") def _validate_structured_output(self, params: SamplingParams) -> None: - if not params.guided_decoding or not self.decoding_config: + if not params.structured_outputs or not self.structured_outputs_config: return - if self.model_config.skip_tokenizer_init and params.guided_decoding: + if self.model_config.skip_tokenizer_init and params.structured_outputs: raise ValueError( "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 ) - engine_level_backend = self.decoding_config.backend - if params.guided_decoding.backend: - # Request-level backend selection is not supported in V1. - # The values may differ if `params` is reused and was set - # to a specific backend based on `auto` behavior in a previous - # request. We remember that it was set as a result of `auto` - # using the `_auto` option set on the backend in the params. - if (params.guided_decoding.backend != engine_level_backend - and not (engine_level_backend == "auto" - and params.guided_decoding.backend_was_auto)): - raise ValueError( - "Request-level structured output backend selection is no " - "longer supported. The request specified " - f"'{params.guided_decoding.backend}', but vLLM was " - f"initialised with '{engine_level_backend}'. This error " - "can be resolved by removing backend selection from the " - "request.") - else: - params.guided_decoding.backend = engine_level_backend + engine_level_backend = self.structured_outputs_config.backend # Request content validation - if (isinstance(params.guided_decoding.choice, list) - and not params.guided_decoding.choice): + if (isinstance(params.structured_outputs.choice, list) + and not params.structured_outputs.choice): # It is invalid for choice to be an empty list - raise ValueError(f"Choice '{params.guided_decoding.choice}' " - "cannot be an empty list") + raise ValueError( + f"Choice '{params.structured_outputs.choice}' cannot be an empty list" # noqa: E501 + ) if engine_level_backend.startswith("xgrammar"): # xgrammar with no fallback @@ -210,15 +193,11 @@ def _validate_structured_output(self, params: SamplingParams) -> None: # between releases as feature support changes. 
try: validate_xgrammar_grammar(params) - params.guided_decoding.backend = "xgrammar" except ValueError: # The request either failed validation # or includes some jsonschema feature(s) that # are not supported in xgrammar. Fall back to guidance. validate_guidance_grammar(params, tokenizer=None) - params.guided_decoding.backend = "guidance" - # Remember that this backend was set automatically - params.guided_decoding.backend_was_auto = True def process_inputs( self, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 85f5dcb92eb4..3f08c02bea24 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -50,7 +50,7 @@ def __init__( time.time() self.status = RequestStatus.WAITING - if sampling_params and sampling_params.guided_decoding is not None: + if sampling_params and sampling_params.structured_outputs is not None: self.status = RequestStatus.WAITING_FOR_FSM self.events: list[EngineCoreEvent] = [] self.stop_reason: Union[int, str, None] = None @@ -63,7 +63,7 @@ def __init__( elif sampling_params is not None: assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens - if sampling_params.guided_decoding is not None: + if sampling_params.structured_outputs is not None: self.status = RequestStatus.WAITING_FOR_FSM if sampling_params.extra_args is not None: @@ -175,7 +175,7 @@ def get_num_encoder_tokens(self, input_id: int) -> int: @property def use_structured_output(self) -> bool: return self.sampling_params is not None and \ - self.sampling_params.guided_decoding is not None + self.sampling_params.structured_outputs is not None def record_event( self, diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 63604a335d9f..4dccd1fe46bf 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -65,7 +65,7 @@ def __init__(self, vllm_config: VllmConfig): lora_config=self.vllm_config.lora_config, ).get_lora_tokenizer(None) reasoning_backend = \ - self.vllm_config.decoding_config.reasoning_backend + self.vllm_config.structured_outputs_config.reasoning_backend if reasoning_backend: reasoner_cls = ReasoningParserManager.get_reasoning_parser( reasoning_backend) @@ -77,7 +77,7 @@ def grammar_init(self, request: Request) -> None: if TYPE_CHECKING: assert request.sampling_params is not None and \ - request.sampling_params.guided_decoding is not None + request.sampling_params.structured_outputs is not None # Initialize the backend the first time it is needed. # @@ -85,7 +85,7 @@ def grammar_init(self, request: Request) -> None: # backends on a per-request basis in V1 (for now, anyway...). 
if self.backend is None: assert request.sampling_params is not None - backend = request.sampling_params.guided_decoding.backend + backend = self.vllm_config.structured_outputs_config.backend vocab_size = self.vllm_config.model_config.get_vocab_size() if backend == "xgrammar": self.backend = XgrammarBackend( diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 02e7fc33f517..e06ab6377de3 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -60,9 +60,9 @@ class GuidanceBackend(StructuredOutputBackend): def __post_init__(self): self.disable_any_whitespace = \ - self.vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.structured_outputs_config.disable_any_whitespace self.disable_additional_properties = \ - self.vllm_config.decoding_config.disable_additional_properties + self.vllm_config.structured_outputs_config.disable_additional_properties self.ll_tokenizer = llguidance_hf.from_tokenizer( self.tokenizer, self.vocab_size) diff --git a/vllm/v1/structured_output/backend_outlines.py b/vllm/v1/structured_output/backend_outlines.py index 572e4984480f..4ea859b305dc 100644 --- a/vllm/v1/structured_output/backend_outlines.py +++ b/vllm/v1/structured_output/backend_outlines.py @@ -158,10 +158,10 @@ def reset(self): def validate_structured_output_request_outlines(params: SamplingParams): - if params.guided_decoding is None: + if params.structured_outputs is None: return - gd_params = params.guided_decoding + gd_params = params.structured_outputs if gd_params.regex: validate_regex_is_buildable(gd_params.regex) @@ -178,7 +178,7 @@ def validate_structured_output_request_outlines(params: SamplingParams): schema = json.dumps(gd_params.json) except Exception as e: raise ValueError( - f"Error serializing guided decoding jsonschema: {e}" + f"Error serializing structured outputs jsonschema: {e}" ) from e pattern = json_schema.build_regex_from_schema(schema) validate_regex_is_buildable(pattern) @@ -187,7 +187,7 @@ def validate_structured_output_request_outlines(params: SamplingParams): regex = "(" + "|".join(choices) + ")" validate_regex_is_buildable(regex) elif gd_params.grammar: - raise ValueError("Outlines guided decoding backend " + raise ValueError("Outlines structured outputs backend " "does not support grammar specifications") @@ -306,7 +306,7 @@ def validate_regex_is_buildable(pattern: str) -> None: _check_unsupported(parsed) except ValueError as e: raise ValueError( - f"Regex uses unsupported feature for guided decoding: {e}. " + f"Regex uses unsupported feature for structured outputs: {e}. " "Only basic matching constructs are supported—lookarounds, " "backreferences, and unicode boundaries are not.") from e @@ -315,6 +315,6 @@ def validate_regex_is_buildable(pattern: str) -> None: "Regex does not have a anchored universal start state" "This means that the Regex uses anchors (^) or look-arounds " "in a way which requires context before any token is matched." - "Guided decoding needs regexes that can match without needing " + "structured outputs needs regexes that can match without needing " "that context. Try rewriting the pattern without using these " f"constructs. 
Pattern:\n{pattern}") diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 5e00f6380416..edea1fd5fc8e 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -34,7 +34,7 @@ class XgrammarBackend(StructuredOutputBackend): def __post_init__(self): self.disable_any_whitespace = \ - self.vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.structured_outputs_config.disable_any_whitespace if isinstance(self.tokenizer, MistralTokenizer): # NOTE: ideally, xgrammar should handle this accordingly. @@ -248,10 +248,10 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: Raises ValueError if the request is not supported. """ - if sampling_params.guided_decoding is None: + if sampling_params.structured_outputs is None: return - gd_params = sampling_params.guided_decoding + gd_params = sampling_params.structured_outputs if gd_params.regex: try: diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index fc365f12573f..99974ef46ecd 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -60,7 +60,7 @@ def structured_output_key(self) -> StructuredOutputKey: def get_structured_output_key( sampling_params: SamplingParams) -> StructuredOutputKey: - params = sampling_params.guided_decoding + params = sampling_params.structured_outputs assert params is not None, "params can't be None." if params.json is not None: if not isinstance(params.json, str): From 47ef968e5f21ccc4858840a02403f495985ac37e Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Tue, 12 Aug 2025 20:48:20 -0400 Subject: [PATCH 02/43] fix: remove unnecessary frontmatter Signed-off-by: Aaron Pham --- docs/features/reasoning_outputs.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 3c66f4bd57df..c0a1c784686b 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -1,7 +1,3 @@ ---- -title: reasoning_outputs ---- - # Reasoning Outputs vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
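Taken together, the hunks above rename the request-level parameter from `guided_decoding` to `structured_outputs` and move backend selection entirely into the engine-level `StructuredOutputsConfig`. A minimal offline sketch of how the renamed API is expected to be used after this series; the model name and JSON schema below are illustrative placeholders, not part of the patches:

```python
# Sketch only: demonstrates the renamed structured-outputs API after this
# patch series. Model name and schema are placeholder assumptions.
from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams

employee_schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
    "required": ["name", "age"],
}

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    # The backend is now chosen once at the engine level; per-request
    # backend selection is removed by the processor.py hunk above.
    structured_outputs_config={"backend": "auto"},
)

params = SamplingParams(
    max_tokens=256,
    # Previously spelled guided_decoding=GuidedDecodingParams(json=...).
    structured_outputs=StructuredOutputsParams(json=employee_schema),
)

outputs = llm.generate(["Give an example employee profile as JSON."], params)
print(outputs[0].outputs[0].text)
```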
From f5d594c919f9f8afe94b0242c04f5109bc87b1d5 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Tue, 12 Aug 2025 21:11:24 -0400 Subject: [PATCH 03/43] fix: tests to use correct CLI args Signed-off-by: Aaron Pham --- .../entrypoints/openai/test_completion_with_function_calling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index a5b081f86107..b2c3386b320b 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -21,7 +21,7 @@ def server(): # noqa: F811 "--dtype", "half", "--enable-auto-tool-choice", - "--guided-decoding-backend", + "--structured-outputs-config.backend", "xgrammar", "--tool-call-parser", "hermes", From bb884cdc8c9d041aca2b1f06a4a36a84ebfc4470 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:07:16 +0200 Subject: [PATCH 04/43] Sweep for `guided_{choice/regex/json/grammar}` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/structured_outputs.md | 2 +- docs/serving/openai_compatible_server.md | 2 +- tests/entrypoints/openai/test_chat.py | 28 ++++++++++--------- .../entrypoints/openai/test_openai_schema.py | 8 ++++-- .../openai/test_chat_completion.py | 14 ++++++++-- .../v1/entrypoints/openai/test_completion.py | 14 ++++++++-- 6 files changed, 44 insertions(+), 24 deletions(-) diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 60da6cc9a7d1..1f955c6e30d6 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -45,7 +45,7 @@ Now let´s see an example for each of the cases, starting with the `choice`, as messages=[ {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} ], - extra_body={"structured_outputs": {"choices": ["positive", "negative"]}}, + extra_body={"structured_outputs": {"choice": ["positive", "negative"]}}, ) print(completion.choices[0].message.content) ``` diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index ec4a1a7004a3..56eb3c515c86 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -133,7 +133,7 @@ completion = client.chat.completions.create( {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} ], extra_body={ - "structured_outputs": {"choices": ["positive", "negative"]} + "structured_outputs": {"choice": ["positive", "negative"]} } ) ``` diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 2ccb0beb7709..4cb1eda87e0f 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -505,7 +505,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, sample_choices, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_choices)) + extra_body=dict(structured_outputs={"choice": sample_choices})) choice1 = chat_completion.choices[0].message.content assert choice1 in sample_choices @@ -519,7 +519,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, sample_choices, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_choices)) + extra_body=dict(structured_outputs={"choice": sample_choices})) choice2 = 
chat_completion.choices[0].message.content assert choice2 in sample_choices assert choice1 != choice2 @@ -545,7 +545,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema)) + extra_body=dict(structured_outputs={"json": sample_json_schema})) message = chat_completion.choices[0].message assert message.content is not None json1 = json.loads(message.content) @@ -562,7 +562,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema)) + extra_body=dict(structured_outputs={"json": sample_json_schema})) message = chat_completion.choices[0].message assert message.content is not None json2 = json.loads(message.content) @@ -590,7 +590,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, model=MODEL_NAME, messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex)) + extra_body=dict(structured_outputs={"regex": sample_regex})) ip1 = chat_completion.choices[0].message.content assert ip1 is not None assert re.fullmatch(sample_regex, ip1) is not None @@ -601,7 +601,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, model=MODEL_NAME, messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex)) + extra_body=dict(structured_outputs={"regex": sample_regex})) ip2 = chat_completion.choices[0].message.content assert ip2 is not None assert re.fullmatch(sample_regex, ip2) is not None @@ -621,12 +621,14 @@ async def test_structured_outputs_type_error(client: openai.AsyncOpenAI): }] with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create(model=MODEL_NAME, - messages=messages, - extra_body=dict(guided_regex={ - 1: "Python", - 2: "C++" - })) + _ = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + extra_body=dict( + structured_outputs={"regex": { + 1: "Python", + 2: "C++" + }})) @pytest.mark.asyncio @@ -648,7 +650,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, max_completion_tokens=10, logprobs=True, top_logprobs=5, - extra_body=dict(guided_choice=sample_choices)) + extra_body=dict(structured_outputs={"choice": sample_choices})) assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs.content is not None diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index 11ed1c4a9ee4..3787c1001f9a 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -102,12 +102,14 @@ def no_invalid_types(case: schemathesis.models.Case): if "custom" in tool_call: return False - # Sometimes guided_grammar is generated to be empty + # Sometimes structured_outputs.grammar is generated to be empty # Causing a server error in EBNF grammar parsing # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421 - guided_grammar = case.body.get("guided_grammar") + structured_outputs = case.body.get("structured_outputs", {}) + g = structured_outputs.get("grammar") if isinstance( + structured_outputs, dict) else None - if guided_grammar == '': + if g == '': # Allow None (will be handled as no grammar) # But skip empty strings return False diff --git 
a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/v1/entrypoints/openai/test_chat_completion.py index dffb32846c05..9aa285aa9b18 100644 --- a/tests/v1/entrypoints/openai/test_chat_completion.py +++ b/tests/v1/entrypoints/openai/test_chat_completion.py @@ -77,7 +77,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI, "role": "user", "content": prompt, }], - extra_body={"guided_json": invalid_json_schema}, + extra_body={"structured_outputs": { + "json": invalid_json_schema + }}, ) @@ -99,7 +101,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str): "content": prompt, }], extra_body={ - "guided_regex": r"[.*", + "structured_outputs": { + "regex": r"[.*" + }, "stop": ["\n"] }, ) @@ -134,5 +138,9 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str): "role": "user", "content": prompt, }], - extra_body={"guided_grammar": invalid_simplified_sql_grammar}, + extra_body={ + "structured_outputs": { + "grammar": invalid_simplified_sql_grammar + } + }, ) diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 3a65583fab8d..afbda20a14c9 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -627,7 +627,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI, await client.completions.create( model=model_name, prompt=prompt, - extra_body={"guided_json": invalid_json_schema}, + extra_body={"structured_outputs": { + "json": invalid_json_schema + }}, ) @@ -646,7 +648,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str): model=model_name, prompt=prompt, extra_body={ - "guided_regex": r"[.*", + "structured_outputs": { + "regex": r"[.*" + }, "stop": ["\n"] }, ) @@ -678,7 +682,11 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str): await client.completions.create( model=model_name, prompt=prompt, - extra_body={"guided_grammar": invalid_simplified_sql_grammar}, + extra_body={ + "structured_outputs": { + "grammar": invalid_simplified_sql_grammar + } + }, ) From f2cd9e09fea9ebe728c56de1d9d4dedd36d75dea Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:08:34 +0200 Subject: [PATCH 05/43] `gd_params` -> `so_params` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../backend_lm_format_enforcer.py | 16 ++++----- vllm/v1/structured_output/backend_outlines.py | 22 ++++++------ vllm/v1/structured_output/backend_xgrammar.py | 34 +++++++++---------- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/vllm/v1/structured_output/backend_lm_format_enforcer.py b/vllm/v1/structured_output/backend_lm_format_enforcer.py index 2279a1c8c8a0..dbc2a59332ef 100644 --- a/vllm/v1/structured_output/backend_lm_format_enforcer.py +++ b/vllm/v1/structured_output/backend_lm_format_enforcer.py @@ -141,27 +141,27 @@ def validate_structured_output_request_lm_format_enforcer( if params.guided_decoding is None: return - gd_params = params.guided_decoding + so_params = params.guided_decoding - if gd_params.regex: + if so_params.regex: return - elif gd_params.json: - if isinstance(gd_params.json, str): + elif so_params.json: + if isinstance(so_params.json, str): try: # make sure schema is valid json - json.loads(gd_params.json) + json.loads(so_params.json) except json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: try: - json.dumps(gd_params.json) + 
json.dumps(so_params.json) except Exception as e: raise ValueError( f"Error serializing guided decoding jsonschema: {e}" ) from e return - elif gd_params.choice: + elif so_params.choice: return - elif gd_params.grammar: + elif so_params.grammar: raise ValueError("LM Format Enforcer guided decoding backend " "does not support grammar specifications") diff --git a/vllm/v1/structured_output/backend_outlines.py b/vllm/v1/structured_output/backend_outlines.py index 4ea859b305dc..e5e638a6ad76 100644 --- a/vllm/v1/structured_output/backend_outlines.py +++ b/vllm/v1/structured_output/backend_outlines.py @@ -161,32 +161,32 @@ def validate_structured_output_request_outlines(params: SamplingParams): if params.structured_outputs is None: return - gd_params = params.structured_outputs + so_params = params.structured_outputs - if gd_params.regex: - validate_regex_is_buildable(gd_params.regex) - elif gd_params.json: - if isinstance(gd_params.json, str): + if so_params.regex: + validate_regex_is_buildable(so_params.regex) + elif so_params.json: + if isinstance(so_params.json, str): try: # make sure schema is valid json - json.loads(gd_params.json) - schema = gd_params.json + json.loads(so_params.json) + schema = so_params.json except json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: try: - schema = json.dumps(gd_params.json) + schema = json.dumps(so_params.json) except Exception as e: raise ValueError( f"Error serializing structured outputs jsonschema: {e}" ) from e pattern = json_schema.build_regex_from_schema(schema) validate_regex_is_buildable(pattern) - elif gd_params.choice: - choices = [regex_escape(str(choice)) for choice in gd_params.choice] + elif so_params.choice: + choices = [regex_escape(str(choice)) for choice in so_params.choice] regex = "(" + "|".join(choices) + ")" validate_regex_is_buildable(regex) - elif gd_params.grammar: + elif so_params.grammar: raise ValueError("Outlines structured outputs backend " "does not support grammar specifications") diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index edea1fd5fc8e..55b4792fe010 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -251,34 +251,34 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: if sampling_params.structured_outputs is None: return - gd_params = sampling_params.structured_outputs + so_params = sampling_params.structured_outputs - if gd_params.regex: + if so_params.regex: try: - xgr.Grammar.from_regex(gd_params.regex) + xgr.Grammar.from_regex(so_params.regex) except Exception as err: raise ValueError("Failed to transform regex into a grammar: " f"{err}") from err - if gd_params.choice: - choice_grammar = choice_as_grammar(gd_params.choice) + if so_params.choice: + choice_grammar = choice_as_grammar(so_params.choice) try: xgr.Grammar.from_ebnf(choice_grammar) except Exception as err: raise ValueError("Failed to transform choices into a grammar: " "{err}") from err - gd_params.choice = None - gd_params.grammar = choice_grammar + so_params.choice = None + so_params.grammar = choice_grammar return - if gd_params.json: - if isinstance(gd_params.json, str): + if so_params.json: + if isinstance(so_params.json, str): try: - schema = json.loads(gd_params.json) + schema = json.loads(so_params.json) except json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: - schema = gd_params.json + schema = 
so_params.json try: xgr.Grammar.from_json_schema(schema) @@ -291,11 +291,11 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: "supported by xgrammar.") return - if gd_params.grammar: - if grammar_is_likely_lark(gd_params.grammar): + if so_params.grammar: + if grammar_is_likely_lark(so_params.grammar): # xgrammar supports EBNF grammars only try: - gd_params.grammar = convert_lark_to_ebnf(gd_params.grammar) + so_params.grammar = convert_lark_to_ebnf(so_params.grammar) except ValueError as e: raise ValueError( "Failed to convert the grammar from Lark to EBNF. ") from e @@ -303,14 +303,14 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: # Test parsing EBNF grammar, possibly already converted from Lark try: # parse the grammar, but we aren't compiling it. - xgr.Grammar.from_ebnf(gd_params.grammar) + xgr.Grammar.from_ebnf(so_params.grammar) except Exception as e: raise ValueError("Invalid grammar specification.") from e return - if gd_params.structural_tag: + if so_params.structural_tag: try: - s_tag = json.loads(gd_params.structural_tag) + s_tag = json.loads(so_params.structural_tag) tags = [ xgr.StructuralTagItem( begin=s["begin"], From 8f37583a8780346cc87a37015940ccfc9d8b9f43 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:11:17 +0200 Subject: [PATCH 06/43] `g` -> `grammar` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/openai/test_openai_schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index 3787c1001f9a..73f79ac28d11 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -106,10 +106,10 @@ def no_invalid_types(case: schemathesis.models.Case): # Causing a server error in EBNF grammar parsing # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421 structured_outputs = case.body.get("structured_outputs", {}) - g = structured_outputs.get("grammar") if isinstance( + grammar = structured_outputs.get("grammar") if isinstance( structured_outputs, dict) else None - if g == '': + if grammar == '': # Allow None (will be handled as no grammar) # But skip empty strings return False From 6e972b81ba1985a0d64a99539e6c01b4972aadf7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:16:13 +0200 Subject: [PATCH 07/43] Fix `config/__init__.py` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 4fae2e2ef3a7..f89e0985bc8a 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3064,10 +3064,10 @@ class StructuredOutputsConfig: """Dataclass which contains structured outputs config for the engine.""" backend: StructuredOutputsBackend = "auto" - """Which engine will be used for structured outputs (JSON schema / regex etc) - by default. With "auto", we will make opinionated choices based on request - contents and what the backend libraries currently support, so the behavior - is subject to change in each release.""" # noqa: E501 + """Which engine will be used for structured outputs (e.g. JSON schema, + regex, etc) by default. 
With "auto", we will make opinionated choices + based on request contents and what the backend libraries currently support, + so the behavior is subject to change in each release.""" disable_fallback: bool = False """If `True`, vLLM will not fallback to a different backend on error.""" @@ -3915,7 +3915,7 @@ def __str__(self): f"enforce_eager={self.model_config.enforce_eager}, " f"kv_cache_dtype={self.cache_config.cache_dtype}, " f"device_config={self.device_config.device}, " - f"decoding_config={self.structured_outputs_config!r}, " + f"structured_outputs_config={self.structured_outputs_config!r}, " f"observability_config={self.observability_config!r}, " f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " From 8810d842c750540912e3af1a226221e4a8cde7e0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:19:25 +0200 Subject: [PATCH 08/43] `engine_level_backend` -> `backend` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/v1/engine/processor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index ace04f17cc91..5906cd601591 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -207,7 +207,7 @@ def _validate_structured_output(self, params: SamplingParams) -> None: "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 ) - engine_level_backend = self.structured_outputs_config.backend + backend = self.structured_outputs_config.backend # Request content validation if (isinstance(params.structured_outputs.choice, list) @@ -217,23 +217,23 @@ def _validate_structured_output(self, params: SamplingParams) -> None: f"Choice '{params.structured_outputs.choice}' cannot be an empty list" # noqa: E501 ) - if engine_level_backend.startswith("xgrammar"): + if backend.startswith("xgrammar"): # xgrammar with no fallback validate_xgrammar_grammar(params) - elif engine_level_backend.startswith("guidance"): + elif backend.startswith("guidance"): # TODO: ideally we would have the LLTokenizer here as Lark syntax # allows <|special_token|> and similar, see # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens # Without tokenizer these are disallowed in grammars. validate_guidance_grammar(params, tokenizer=None) - elif engine_level_backend == "outlines": + elif backend == "outlines": # outlines backend validate_structured_output_request_outlines(params) - elif engine_level_backend == "lm-format-enforcer": + elif backend == "lm-format-enforcer": # lm format enforcer backend validate_structured_output_request_lm_format_enforcer(params) else: - # NOTE: engine_level_backend must be "auto" here, because we have + # NOTE: backend must be "auto" here, because we have # checked supported_backends above. # "auto" is an opt-in to opinionated behavior where we try to # choose a backend based on request contents. 
This is not the From 03962b2cefb6be39cda8cd2ff6a2d5159d3f7de5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:20:41 +0200 Subject: [PATCH 09/43] `guided_decoding` -> `structured_outputs` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/v1/structured_output/backend_lm_format_enforcer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/structured_output/backend_lm_format_enforcer.py b/vllm/v1/structured_output/backend_lm_format_enforcer.py index dbc2a59332ef..5f9925d209fe 100644 --- a/vllm/v1/structured_output/backend_lm_format_enforcer.py +++ b/vllm/v1/structured_output/backend_lm_format_enforcer.py @@ -138,10 +138,10 @@ def destroy(self): def validate_structured_output_request_lm_format_enforcer( params: SamplingParams): - if params.guided_decoding is None: + if params.structured_outputs is None: return - so_params = params.guided_decoding + so_params = params.structured_outputs if so_params.regex: return From ea8673513b5ac838f1d28a1d389a8a124718847e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:32:33 +0200 Subject: [PATCH 10/43] Fix `protocol.py` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index b7b617dff503..b3f49a2c0517 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -639,7 +639,8 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - structured_outputs = StructuredOutputsParams(**self.structured_outputs) + structured_outputs = StructuredOutputsParams( + **(self.structured_outputs or {})) if self.response_format is not None: if self.response_format.type == "json_object": structured_outputs.json_object = True @@ -828,8 +829,8 @@ def check_structured_outputs_count(cls, data): # you can only use one kind of constraints for structured outputs if count > 1: raise ValueError( - "You can only use one kind of constraints for structured outputs ('json', 'regex' or 'choice')." # noqa: E501 - ) + "You can only use one kind of constraints for structured " + "outputs ('json', 'regex' or 'choice').") # you can only either use structured outputs or tools, not both if count > 1 and data.get("tool_choice", "none") not in ( "none", @@ -837,8 +838,8 @@ def check_structured_outputs_count(cls, data): "required", ): raise ValueError( - "You can only either use constraints for structured outputs or tools, not both." # noqa: E501 - ) + "You can only either use constraints for structured outputs " + "or tools, not both.") return data @model_validator(mode="before") @@ -1125,7 +1126,7 @@ def to_sampling_params( echo_without_generation = self.echo and self.max_tokens == 0 structured_outputs_kwargs = StructuredOutputsParams( - **self.structured_outputs) + **(self.structured_outputs or {})) if (self.response_format is not None and self.response_format.type == "json_object"): structured_outputs_kwargs.json_object = True @@ -1192,8 +1193,8 @@ def check_structured_outputs_count(cls, data): ]) if count > 1: raise ValueError( - "You can only use one kind of constraints for structured outputs ('json', 'regex' or 'choice')." 
# noqa: E501 - ) + "You can only use one kind of constraints for structured " + "outputs ('json', 'regex' or 'choice').") return data @model_validator(mode="before") From fc0ce57f3c166d3b04974e97b894347f20bd574f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:34:53 +0200 Subject: [PATCH 11/43] Missing docstring Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/sampling_params.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index e4a79572390a..398eb93d3ec3 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -76,8 +76,8 @@ def __post_init__(self): ]) if count > 1: raise ValueError( - f"You can only use one kind of structured outputs constraint but multiple are specified: {self.__dict__}" # noqa: E501 - ) + "You can only use one kind of structured outputs constraint " + f"but multiple are specified: {self.__dict__}") class RequestOutputKind(Enum): @@ -192,6 +192,7 @@ class SamplingParams( # Fields used to construct logits processors structured_outputs: Optional[StructuredOutputsParams] = None + """Parameters for configuring structured outputs.""" logit_bias: Optional[dict[int, float]] = None """If provided, the engine will construct a logits processor that applies these logit biases.""" From 36772c62e2c91e9dc7383f4ebe06231a06c508ec Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:29:14 +0200 Subject: [PATCH 12/43] Add missing backend selection Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 01fb5eb635ab..db5684ffaa3f 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -119,8 +119,9 @@ def test_structured_output( llm = LLM(model=model_name, enforce_eager=enforce_eager, max_model_len=1024, - structured_outputs_config=dict( - disable_any_whitespace=backend in {"xgrammar", "guidance"}), + structured_outputs_config=dict(backend=backend, + disable_any_whitespace=backend + in {"xgrammar", "guidance"}), tokenizer_mode=tokenizer_mode, speculative_config=speculative_config) From 6ac63c65628d858e3efa8b8d35aadfe57dd5bbc8 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:34:57 +0200 Subject: [PATCH 13/43] Remove last references to `guided_` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/scripts/hardware_ci/run-amd-test.sh | 6 ------ .github/mergify.yml | 1 - tests/entrypoints/openai/test_chat.py | 18 ++++++++++-------- .../llm/test_struct_output_generate.py | 2 +- 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index c395011a2448..7f90181048d0 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -167,12 +167,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then --ignore=entrypoints/llm/test_prompt_validation.py "} fi -#Obsolete currently -##ignore certain Entrypoints/llm tests -#if [[ $commands == *" && pytest -v -s 
entrypoints/llm/test_guided_generate.py"* ]]; then -# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} -#fi - # --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_oot_registration.py diff --git a/.github/mergify.yml b/.github/mergify.yml index 495d207d4426..cc27947a4f0e 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -164,7 +164,6 @@ pull_request_rules: - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files~=^tests/v1/structured_output/ - - files=tests/v1/entrypoints/llm/test_guided_generate.py - files~=^vllm/v1/structured_output/ actions: label: diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 4cb1eda87e0f..a01263b94955 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -487,8 +487,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_guided_choice_chat(client: openai.AsyncOpenAI, sample_choices, - is_v1_server: bool): +async def test_structured_outputs_choice_chat(client: openai.AsyncOpenAI, + sample_choices, + is_v1_server: bool): if not is_v1_server: pytest.skip("Guided decoding is only supported in v1 engine") messages = [{ @@ -526,8 +527,9 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, sample_choices, @pytest.mark.asyncio -async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, - is_v1_server: bool): +async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, + sample_json_schema, + is_v1_server: bool): if not is_v1_server: pytest.skip("Guided decoding is only supported in v1 engine") @@ -572,8 +574,8 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, @pytest.mark.asyncio -async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, - is_v1_server: bool): +async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI, + sample_regex, is_v1_server: bool): if not is_v1_server: pytest.skip("Guided decoding is only supported in v1 engine") @@ -632,8 +634,8 @@ async def test_structured_outputs_type_error(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, - sample_choices): +async def test_structured_outputs_choice_chat_logprobs( + client: openai.AsyncOpenAI, sample_choices): messages = [{ "role": "system", diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index db5684ffaa3f..69ce5c008c82 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -742,7 +742,7 @@ def generate_with_backend(backend): @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"]) -def test_structured_output_batched_with_non_guided_requests( +def test_structured_output_batched_with_non_structured_outputs_requests( monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], backend: str, From 17c574daf779e28d47cf808b9510df31c4cbd0a5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:41:04 +0200 Subject: [PATCH 14/43] Fix arg utils Signed-off-by: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 11034d700113..486013dabe98 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1341,9 +1341,6 @@ def create_engine_config( load_config = self.create_load_config() - structured_outputs_config = StructuredOutputsConfig( - **self.structured_outputs) - observability_config = ObservabilityConfig( show_hidden_metrics_for_version=( self.show_hidden_metrics_for_version), @@ -1360,7 +1357,7 @@ def create_engine_config( lora_config=lora_config, speculative_config=speculative_config, load_config=load_config, - structured_outputs_config=structured_outputs_config, + structured_outputs_config=self.structured_outputs_config, observability_config=observability_config, compilation_config=self.compilation_config, kv_transfer_config=self.kv_transfer_config, From b0c2916b07bb200ed5cfe82d7d3f8ede84e87194 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:52:52 +0200 Subject: [PATCH 15/43] `reasoning_backend` -> `reasoning_parser`, fix `args.reasoning_parser` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../v1/entrypoints/llm/test_struct_output_generate.py | 2 +- vllm/config/__init__.py | 2 +- vllm/entrypoints/openai/api_server.py | 10 +++++----- vllm/model_executor/models/config.py | 4 ++-- vllm/v1/structured_output/__init__.py | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 69ce5c008c82..b989c96dc1ff 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -581,7 +581,7 @@ def test_structured_output_with_reasoning_matrices( max_num_seqs=16, backend=backend, structured_outputs_config=dict(disable_any_whitespace=True, - reasoning_backend=reasoning_parser), + reasoning_parser=reasoning_parser), tokenizer_mode=tokenizer_mode, speculative_config=speculative_config, ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index f89e0985bc8a..bd4ad9b387fc 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3081,7 +3081,7 @@ class StructuredOutputsConfig: in the JSON schema. This is only supported for the `guidance` backend and is used to better align its behaviour with `outlines` and `xgrammar`.""" - reasoning_backend: str = "" + reasoning_parser: str = "" """Select the reasoning parser depending on the model that you're using. This is used to parse the reasoning content into OpenAI API format.""" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3cebfdf885be..88e06b2adf35 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1746,7 +1746,7 @@ async def init_app_state( enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, tool_server=tool_server, - reasoning_parser=args.reasoning_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, @@ -1765,7 +1765,7 @@ async def init_app_state( exclude_tools_when_tool_choice_none=args. 
exclude_tools_when_tool_choice_none, tool_parser=args.tool_call_parser, - reasoning_parser=args.reasoning_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, @@ -1868,10 +1868,10 @@ def validate_api_server_args(args): f"(chose from {{ {','.join(valid_tool_parses)} }})") valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys() - if args.reasoning_parser \ - and args.reasoning_parser not in valid_reasoning_parses: + if ((reasoning_parser := args.structured_outputs_config.reasoning_parser) + and reasoning_parser not in valid_reasoning_parses): raise KeyError( - f"invalid reasoning parser: {args.reasoning_parser} " + f"invalid reasoning parser: {reasoning_parser} " f"(chose from {{ {','.join(valid_reasoning_parses)} }})") diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 6159f5c9a359..d2063f962a8d 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -253,8 +253,8 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: structured_outputs_config = vllm_config.structured_outputs_config - if structured_outputs_config.reasoning_backend == "": - structured_outputs_config.reasoning_backend = "GptOss" + if structured_outputs_config.reasoning_parser == "": + structured_outputs_config.reasoning_parser = "GptOss" # Increase the max capture size from 512 to 1024 for performance. # NOTE(woosuk): This will increase the number of CUDA graphs diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index c2bffc345d41..8ac5ea4129f7 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -64,11 +64,11 @@ def __init__(self, vllm_config: VllmConfig): scheduler_config=self.vllm_config.scheduler_config, lora_config=self.vllm_config.lora_config, ).get_lora_tokenizer(None) - reasoning_backend = \ - self.vllm_config.structured_outputs_config.reasoning_backend - if reasoning_backend: + reasoning_parser = \ + self.vllm_config.structured_outputs_config.reasoning_parser + if reasoning_parser: reasoner_cls = ReasoningParserManager.get_reasoning_parser( - reasoning_backend) + reasoning_parser) self.reasoner = reasoner_cls(tokenizer=self.tokenizer) def grammar_init(self, request: Request) -> None: From 48328d784694ba9da1c95613cf3df0eb2f8cf5af Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:58:22 +0200 Subject: [PATCH 16/43] Replace more instances of guided/guided decoding Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/openai/test_chat.py | 8 ++++---- .../openai/test_completion_with_function_calling.py | 2 +- tests/entrypoints/openai/test_prompt_validation.py | 2 +- .../openai/test_transcription_validation.py | 2 +- .../openai/test_translation_validation.py | 2 +- .../entrypoints/llm/test_struct_output_generate.py | 12 ++++++------ vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/transformers_utils/tokenizers/mistral.py | 2 +- .../structured_output/backend_lm_format_enforcer.py | 4 ++-- 9 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index e2d438f76311..38015053867d 100644 --- 
a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# imports for guided decoding tests +# imports for structured outputs tests import json from typing import Optional @@ -489,7 +489,7 @@ async def test_structured_outputs_choice_chat(client: openai.AsyncOpenAI, sample_choices, is_v1_server: bool): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("Structured outputs is only supported in v1 engine") messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -529,7 +529,7 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, sample_json_schema, is_v1_server: bool): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("Structured outputs is only supported in v1 engine") messages = [{ "role": "system", @@ -575,7 +575,7 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI, sample_regex, is_v1_server: bool): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("Structured outputs is only supported in v1 engine") messages = [{ "role": "system", diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index daaf441abac1..3649cefa9bf4 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -225,7 +225,7 @@ def k2_server(): # noqa: F811 "--dtype", "half", "--enable-auto-tool-choice", - "--guided-decoding-backend", + "--structured-outputs-config.backend", "xgrammar", "--tool-call-parser", "hermes", diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 4197583074df..895149b8d969 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -3,7 +3,7 @@ import io -# imports for guided decoding tests +# imports for structured outputs tests import openai import pybase64 import pytest diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 6a3cdfdfc808..23c99da97ad3 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# imports for guided decoding tests +# imports for structured outputs tests import io import json diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index f43b7a253d28..eb7879927b9b 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import io -# imports for guided decoding tests +# imports for structured outputs tests import json import httpx diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index b989c96dc1ff..52fac6173d23 100644 --- 
a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -762,14 +762,14 @@ def test_structured_output_batched_with_non_structured_outputs_requests( disable_any_whitespace=(backend in {"xgrammar", "guidance"})), ) - guided_prompt = ( + structured_outputs_prompt = ( "Give an example JSON for an employee profile that fits this " "schema. Make the response as short as possible. Schema: " f"{sample_json_schema}") - non_guided_prompt = "The diameter of the Earth in kilometers is " + non_structured_outputs_prompt = "The diameter of the Earth in kilometers is " - prompts = [guided_prompt, non_guided_prompt] + prompts = [structured_outputs_prompt, non_structured_outputs_prompt] sampling_params = [ SamplingParams(temperature=1.0, max_tokens=400, @@ -805,16 +805,16 @@ def test_structured_output_batched_with_non_structured_outputs_requests( print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}") if index == 0: - # First prompt is guided, expect valid JSON + # First prompt is structured outputs, expect valid JSON assert "\n" not in generated_text output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=sample_json_schema) else: - # Second prompt is not guided, expect valid output + # Second prompt is not structured outputs, expect valid output # Cannot assert on exact output, but we can expect it to be factual assert "12,742" in generated_text - # non-guided requests should not return a valid JSON here + # non-structured outputs requests should not return a valid JSON here with pytest.raises(ValueError): output_json = json.loads(generated_text) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 35edd2f85cd0..5bee1c5a0d33 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -945,7 +945,7 @@ async def chat_completion_stream_generator( # check to make sure we haven't "forgotten" to stream # any tokens that were generated but previously # matched by partial json parsing - # only happens if we are NOT using guided decoding + # only happens if we are NOT using structured outputs auto_tools_called = False if tool_parser: auto_tools_called = len( diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 5a299b1adf34..479a24e2fee2 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -274,7 +274,7 @@ def _download_mistral_tokenizer_from_hf(tokenizer_name: str, return tokenizer_file # the following attributes are set to fit vLLM's design and are used - # by the guided structured output backends. + # by the structured output backends. 
@property def all_special_tokens_extended(self) -> list[str]: from mistral_common.tokens.tokenizers.base import SpecialTokens diff --git a/vllm/v1/structured_output/backend_lm_format_enforcer.py b/vllm/v1/structured_output/backend_lm_format_enforcer.py index 5f9925d209fe..465b2428f893 100644 --- a/vllm/v1/structured_output/backend_lm_format_enforcer.py +++ b/vllm/v1/structured_output/backend_lm_format_enforcer.py @@ -157,11 +157,11 @@ def validate_structured_output_request_lm_format_enforcer( json.dumps(so_params.json) except Exception as e: raise ValueError( - f"Error serializing guided decoding jsonschema: {e}" + f"Error serializing structured outputs jsonschema: {e}" ) from e return elif so_params.choice: return elif so_params.grammar: - raise ValueError("LM Format Enforcer guided decoding backend " + raise ValueError("LM Format Enforcer structured outputs backend " "does not support grammar specifications") From 8b35c084eead3116132caefabc8619dff4884c13 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 13:44:04 +0200 Subject: [PATCH 17/43] Remove `StructuredOutputsParams.from_optional` as it's not necessary Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 75 +++++++++++++---------------- vllm/sampling_params.py | 27 ----------- 2 files changed, 34 insertions(+), 68 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 50375d64156d..a696864f1f62 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -347,8 +347,9 @@ def to_sampling_params( structured_outputs = None if self.text is not None and self.text.format is not None: response_format = self.text.format - if response_format.type == "json_schema": - structured_outputs = StructuredOutputsParams.from_optional( + if (response_format.type == "json_schema" + and response_format.schema_ is not None): + structured_outputs = StructuredOutputsParams( json=response_format.schema_) elif response_format.type == "json_object": raise NotImplementedError("json_object is not supported") @@ -639,31 +640,28 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - structured_outputs = StructuredOutputsParams( - **(self.structured_outputs or {})) - if self.response_format is not None: - if self.response_format.type == "json_object": - structured_outputs.json_object = True - elif self.response_format.type == "json_schema": - json_schema = self.response_format.json_schema - assert json_schema is not None - structured_outputs.json = json_schema.json_schema - elif self.response_format.type == "structural_tag": - structural_tag = self.response_format - assert structural_tag is not None and isinstance( - structural_tag, StructuralTagResponseFormat) - s_tag_obj = structural_tag.model_dump(by_alias=True) - structured_outputs.structural_tag = json.dumps(s_tag_obj) - - structured_outputs = StructuredOutputsParams.from_optional( - json=self._get_json_schema_from_tool() or structured_outputs.json, - regex=structured_outputs.regex, - choice=structured_outputs.choice, - grammar=structured_outputs.grammar, - json_object=structured_outputs.json_object, - whitespace_pattern=structured_outputs.whitespace_pattern, - structural_tag=structured_outputs.structural_tag, - ) + structured_outputs = None + if (self.structured_outputs is not None + and any(v is not None + for v in self.structured_outputs.values())): + 
structured_outputs = StructuredOutputsParams( + **self.structured_outputs) + + if self.response_format is not None: + if self.response_format.type == "json_object": + structured_outputs.json_object = True + elif self.response_format.type == "json_schema": + json_schema = self.response_format.json_schema + assert json_schema is not None + structured_outputs.json = json_schema.json_schema + elif self.response_format.type == "structural_tag": + structural_tag = self.response_format + assert structural_tag is not None and isinstance( + structural_tag, StructuralTagResponseFormat) + s_tag_obj = structural_tag.model_dump(by_alias=True) + structured_outputs.structural_tag = json.dumps(s_tag_obj) + if json_schema := self._get_json_schema_from_tool(): + structured_outputs.json = json_schema extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -1125,20 +1123,15 @@ def to_sampling_params( echo_without_generation = self.echo and self.max_tokens == 0 - structured_outputs_kwargs = StructuredOutputsParams( - **(self.structured_outputs or {})) - if (self.response_format is not None - and self.response_format.type == "json_object"): - structured_outputs_kwargs.json_object = True - - structured_outputs = StructuredOutputsParams.from_optional( - json=structured_outputs_kwargs.json, - regex=structured_outputs_kwargs.regex, - choice=structured_outputs_kwargs.choice, - grammar=structured_outputs_kwargs.grammar, - json_object=structured_outputs_kwargs.json_object, - whitespace_pattern=structured_outputs_kwargs.whitespace_pattern, - ) + structured_outputs = None + if (self.structured_outputs is not None + and any(v is not None + for v in self.structured_outputs.values())): + structured_outputs = StructuredOutputsParams( + **self.structured_outputs) + if (self.response_format is not None + and self.response_format.type == "json_object"): + structured_outputs.json_object = True extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 398eb93d3ec3..94c8497028cc 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -8,7 +8,6 @@ from typing import Annotated, Any, Optional, Union import msgspec -from pydantic import BaseModel from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor @@ -42,32 +41,6 @@ class StructuredOutputsParams: whitespace_pattern: Optional[str] = None structural_tag: Optional[str] = None - @staticmethod - def from_optional( - json: Optional[Union[dict, BaseModel, str]] = None, - regex: Optional[str] = None, - choice: Optional[list[str]] = None, - grammar: Optional[str] = None, - json_object: Optional[bool] = None, - whitespace_pattern: Optional[str] = None, - structural_tag: Optional[str] = None, - ) -> Optional["StructuredOutputsParams"]: - if all(arg is None for arg in (json, regex, choice, grammar, - json_object, structural_tag)): - return None - # Extract json schemas from pydantic models - if isinstance(json, (BaseModel, type(BaseModel))): - json = json.model_json_schema() - return StructuredOutputsParams( - json=json, - regex=regex, - choice=choice, - grammar=grammar, - json_object=json_object, - whitespace_pattern=whitespace_pattern, - structural_tag=structural_tag, - ) - def __post_init__(self): """Validate that some fields are mutually exclusive.""" count = sum([ From f8947e12f5b92056c5b0b06de6306467da2ae62e Mon Sep 17 00:00:00 2001 From: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 13:46:44 +0200 Subject: [PATCH 18/43] Fix tests Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/llm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 4d64f231bcc4..d2ee7b15b641 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -248,6 +248,8 @@ def __init__( ))) else: structured_outputs_instance = structured_outputs_config + else: + structured_outputs_instance = StructuredOutputsConfig() engine_args = EngineArgs( model=model, From ec81c2bc4efbdfcec24277dbcb68ab06be256428 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 14:02:21 +0200 Subject: [PATCH 19/43] pre-commit Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index a696864f1f62..e91e46c333a9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -660,8 +660,8 @@ def to_sampling_params( structural_tag, StructuralTagResponseFormat) s_tag_obj = structural_tag.model_dump(by_alias=True) structured_outputs.structural_tag = json.dumps(s_tag_obj) - if json_schema := self._get_json_schema_from_tool(): - structured_outputs.json = json_schema + if structured_outputs_json := self._get_json_schema_from_tool(): + structured_outputs.json = structured_outputs_json extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -700,8 +700,7 @@ def to_sampling_params( extra_args=extra_args or None, ) - def _get_json_schema_from_tool( - self) -> Optional[Union[str, dict, BaseModel]]: + def _get_json_schema_from_tool(self) -> Optional[Union[str, dict]]: # user has chosen to not use any tool if self.tool_choice == "none" or self.tools is None: return None From b991f92d3e3ea44b6b099be21a6fb7a301b31c1c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 16:00:27 +0200 Subject: [PATCH 20/43] Fix tests Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 52fac6173d23..57490112f3df 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -579,8 +579,9 @@ def test_structured_output_with_reasoning_matrices( enforce_eager=bool(not current_platform.is_tpu()), max_model_len=1024, max_num_seqs=16, - backend=backend, - structured_outputs_config=dict(disable_any_whitespace=True, + structured_outputs_config=dict(backend=backend, + disable_any_whitespace=backend + not in {"xgrammar", "guidance"}, reasoning_parser=reasoning_parser), tokenizer_mode=tokenizer_mode, speculative_config=speculative_config, @@ -643,7 +644,7 @@ def test_structured_output_auto_mode( llm = LLM(model=model_name, max_model_len=1024, - backend="auto", + structured_outputs=dict(backend="auto"), tokenizer_mode=tokenizer_mode) sampling_params = SamplingParams( @@ -686,6 +687,7 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): llm = 
LLM(model="Qwen/Qwen2.5-1.5B-Instruct", max_model_len=1024, structured_outputs_config=dict( + backend="guidance", disable_any_whitespace=True, disable_additional_properties=True)) From ce7390330dd8d4bd324a52638ff69ff2599dfb3f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 16:07:08 +0200 Subject: [PATCH 21/43] Fix test pipeline for removed file Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 55349e0ac932..e248e73ae954 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -109,8 +109,7 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests From 37e77514b87d4c1e55f16286aebc71484b5a8908 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 16:10:09 +0200 Subject: [PATCH 22/43] `--reasoning-parser` -> `--structured-outputs-config.reasoning_parser` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/reasoning_outputs.md | 10 +++++----- docs/features/structured_outputs.md | 2 +- docs/features/tool_calling.md | 2 +- ...openai_chat_completion_tool_calls_with_reasoning.py | 2 +- .../openai_chat_completion_with_reasoning.py | 2 +- .../openai_chat_completion_with_reasoning_streaming.py | 2 +- examples/online_serving/structured_outputs/README.md | 2 +- .../openai/test_chat_with_tool_reasoning.py | 6 +++--- tests/entrypoints/openai/test_cli_args.py | 4 ++-- .../openai/test_completion_with_function_calling.py | 4 ++-- tests/v1/entrypoints/openai/responses/conftest.py | 2 +- 11 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 4b4422f4bf1f..377bc212797c 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -22,11 +22,11 @@ vLLM currently supports the following reasoning models: ## Quickstart -To use reasoning models, you need to specify the `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. +To use reasoning models, you need to specify the `--structured-outputs-config.reasoning_parser` flags when making a request to the chat completion endpoint. The `--structured-outputs-config.reasoning_parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --reasoning-parser deepseek_r1 + --structured-outputs-config.reasoning_parser deepseek_r1 ``` Next, make a request to the model that should return the reasoning content in the response. 
@@ -208,7 +208,7 @@ You can add a new `ReasoningParser` similar to --reasoning-parser example +vllm serve --structured-outputs-config.reasoning_parser example ``` diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 1f955c6e30d6..af23d7108975 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -161,7 +161,7 @@ See also: [full example](../examples/online_serving/structured_outputs.md) You can also use structured outputs with for reasoning models. ```bash -vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1 +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --structured-outputs-config.reasoning_parser deepseek_r1 ``` Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema: diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 35b01ef55b19..e814e3b497aa 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -308,7 +308,7 @@ Supported models: Flags: * For non-reasoning: `--tool-call-parser hunyuan_a13b` -* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning` +* For reasoning: `--tool-call-parser hunyuan_a13b --structured-outputs-config.reasoning_parser hunyuan_a13b --enable_reasoning` ### Models with Pythonic Tool Calls (`pythonic`) diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py index 4006d07f73b0..cd3a7eb2b51f 100644 --- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py @@ -10,7 +10,7 @@ ```bash vllm serve Qwen/QwQ-32B \ - --reasoning-parser deepseek_r1 \ + --structured-outputs-config.reasoning_parser deepseek_r1 \ --enable-auto-tool-choice --tool-call-parser hermes ``` diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index 932dbeb2e7a2..8b934704442f 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -9,7 +9,7 @@ ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --reasoning-parser deepseek_r1 + --structured-outputs-config.reasoning_parser deepseek_r1 ``` This example demonstrates how to generate chat completions from reasoning models diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index 7d1ea3771459..e952bf7ab0f4 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -9,7 +9,7 @@ ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --reasoning-parser deepseek_r1 + --structured-outputs-config.reasoning_parser deepseek_r1 ``` Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the diff --git a/examples/online_serving/structured_outputs/README.md b/examples/online_serving/structured_outputs/README.md index d2777a43d478..bde4fa105808 100644 --- a/examples/online_serving/structured_outputs/README.md +++ b/examples/online_serving/structured_outputs/README.md @@ -14,7 +14,7 @@ To serve a reasoning model, you can use the following 
command: ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ - --reasoning-parser deepseek_r1 + --structured-outputs-config.reasoning_parser deepseek_r1 ``` If you want to run this script standalone with `uv`, you can use the following: diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py index 03730b67283c..6d7067159894 100644 --- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py +++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py @@ -14,9 +14,9 @@ @pytest.fixture(scope="module") def server(): # noqa: F811 args = [ - "--max-model-len", "8192", "--enforce-eager", "--reasoning-parser", - "deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser", - "hermes" + "--max-model-len", "8192", "--enforce-eager", + "--structured-outputs-config.reasoning_parser", "deepseek_r1", + "--enable-auto-tool-choice", "--tool-call-parser", "hermes" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 9a1c0ea13b54..f22008f44d15 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -145,7 +145,7 @@ def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser): """Ensure validation fails if reasoning is enabled with auto tool choice""" args = serve_parser.parse_args(args=[ "--enable-auto-tool-choice", - "--reasoning-parser", + "--structured-outputs-config.reasoning_parser", "deepseek_r1", ]) with pytest.raises(TypeError): @@ -156,7 +156,7 @@ def test_passes_with_reasoning_parser(serve_parser): """Ensure validation passes if reasoning is enabled with a reasoning parser""" args = serve_parser.parse_args(args=[ - "--reasoning-parser", + "--structured-outputs-config.reasoning_parser", "deepseek_r1", ]) validate_parsed_serve_args(args) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 3649cefa9bf4..8025f78e2c61 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -146,7 +146,7 @@ def server(): # noqa: F811 "xgrammar", "--tool-call-parser", "hermes", - "--reasoning-parser", + "--structured-outputs-config.reasoning_parser", "qwen3", "--gpu-memory-utilization", "0.4" @@ -229,7 +229,7 @@ def k2_server(): # noqa: F811 "xgrammar", "--tool-call-parser", "hermes", - "--reasoning-parser", + "--structured-outputs-config.reasoning_parser", "qwen3", "--gpu-memory-utilization", "0.4", diff --git a/tests/v1/entrypoints/openai/responses/conftest.py b/tests/v1/entrypoints/openai/responses/conftest.py index 2d677a00b646..f33e590b7296 100644 --- a/tests/v1/entrypoints/openai/responses/conftest.py +++ b/tests/v1/entrypoints/openai/responses/conftest.py @@ -15,7 +15,7 @@ def default_server_args(): "--max-model-len", "8192", "--enforce-eager", # For faster startup. 
- "--reasoning-parser", + "--structured-outputs-config.reasoning_parser", "deepseek_r1", ] From f85abd756820b40428b9d263be752f61ef9d5c38 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:35:37 +0200 Subject: [PATCH 23/43] Add reasoning parser back as an `InitVar` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index dcaa534b5850..ca67a90dfe63 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -8,7 +8,7 @@ import functools import json import sys -from dataclasses import MISSING, dataclass, fields, is_dataclass +from dataclasses import MISSING, InitVar, dataclass, fields, is_dataclass from itertools import permutations from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union, cast, get_args, @@ -39,6 +39,7 @@ from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_ray_initialized +from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.config import get_model_path, is_interleaved from vllm.transformers_utils.utils import check_gguf_file @@ -415,6 +416,7 @@ class EngineArgs: structured_outputs_config: StructuredOutputsConfig = get_field( VllmConfig, "structured_outputs_config") + reasoning_parser: InitVar[str] = StructuredOutputsConfig.reasoning_parser logits_processor_pattern: Optional[ str] = ModelConfig.logits_processor_pattern @@ -470,7 +472,8 @@ class EngineArgs: kv_sharing_fast_prefill: bool = \ CacheConfig.kv_sharing_fast_prefill - def __post_init__(self): + def __post_init__(self, reasoning_parser: str): + self.structured_outputs_config.reasoning_parser = reasoning_parser # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object @@ -609,6 +612,17 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: load_group.add_argument('--pt-load-map-location', **load_kwargs["pt_load_map_location"]) + # Structured outputs arguments + structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig) + structured_outputs_group = parser.add_argument_group( + title="StructuredOutputsConfig", + description=StructuredOutputsConfig.__doc__, + ) + structured_outputs_group.add_argument( + "--reasoning-parser", + # This choice is a special case because it's not static + choices=list(ReasoningParserManager.reasoning_parsers), + **structured_outputs_kwargs["reasoning_parser"]) # Parallel arguments parallel_kwargs = get_kwargs(ParallelConfig) parallel_group = parser.add_argument_group( From 85b3b4305feda7a306f404555340d1c16b88907a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:37:53 +0200 Subject: [PATCH 24/43] `pre-commit` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/llm_engine.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ee211fcc1b01..03f96e196e9b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -213,6 +213,7 @@ def __init__( self.device_config = vllm_config.device_config self.speculative_config = 
vllm_config.speculative_config # noqa self.load_config = vllm_config.load_config + self.structured_outputs_config = vllm_config.structured_outputs_config self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa ) @@ -371,10 +372,9 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.observability_config.otlp_traces_endpoint) # Initialize reasoning parser if reasoning backend is set. - if self.decoding_config.reasoning_backend and \ - self.tokenizer: + if self.structured_outputs_config.reasoning_parser and self.tokenizer: reasoner_class = ReasoningParserManager.get_reasoning_parser( - self.decoding_config.reasoning_backend) + self.structured_outputs_config.reasoning_parser) self.reasoner: ReasoningParser = reasoner_class( self.tokenizer.get_lora_tokenizer()) @@ -390,7 +390,8 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: stop_checker=StopChecker( self.scheduler_config.max_model_len, get_tokenizer_for_seq, - self.reasoner if self.decoding_config.reasoning_backend + self.reasoner + if self.structured_outputs_config.reasoning_parser and self.tokenizer else None, ), )) From b4f70f73f9740a14001aed92813eef67f7890375 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:41:46 +0200 Subject: [PATCH 25/43] Reinstate deprecated guided decoding CLI args Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ca67a90dfe63..126a1b74e3e0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -417,6 +417,11 @@ class EngineArgs: structured_outputs_config: StructuredOutputsConfig = get_field( VllmConfig, "structured_outputs_config") reasoning_parser: InitVar[str] = StructuredOutputsConfig.reasoning_parser + # Deprecated guided decoding fields + guided_decoding_backend: str = None + guided_decoding_disable_fallback: bool = None + guided_decoding_disable_any_whitespace: bool = None + guided_decoding_disable_additional_properties: bool = None logits_processor_pattern: Optional[ str] = ModelConfig.logits_processor_pattern @@ -623,6 +628,19 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: # This choice is a special case because it's not static choices=list(ReasoningParserManager.reasoning_parsers), **structured_outputs_kwargs["reasoning_parser"]) + # Deprecated guided decoding arguments + for arg, type in [ + ("--guided-decoding-backend", str), + ("--guided-decoding-disable-fallback", bool), + ("--guided-decoding-disable-any-whitespace", bool), + ("--guided-decoding-disable-additional-properties", bool), + ]: + structured_outputs_group.add_argument( + arg, + type=type, + help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."), + deprecated=True) + # Parallel arguments parallel_kwargs = get_kwargs(ParallelConfig) parallel_group = parser.add_argument_group( @@ -1399,6 +1417,21 @@ def create_engine_config( load_config = self.create_load_config() + # Forward the deprecated CLI args to the StructuredOutputsConfig + so_config = self.structured_outputs_config + if self.guided_decoding_backend is not None: + so_config.guided_decoding_backend = \ + self.guided_decoding_backend + if self.guided_decoding_disable_fallback is not None: + so_config.guided_decoding_disable_fallback = \ + self.guided_decoding_disable_fallback + if 
self.guided_decoding_disable_any_whitespace is not None: + so_config.guided_decoding_disable_any_whitespace = \ + self.guided_decoding_disable_any_whitespace + if self.guided_decoding_disable_additional_properties is not None: + so_config.guided_decoding_disable_additional_properties = \ + self.guided_decoding_disable_additional_properties + observability_config = ObservabilityConfig( show_hidden_metrics_for_version=( self.show_hidden_metrics_for_version), From b52aab9d711ec599c003fafb9a53bf040b733ea4 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:51:35 +0200 Subject: [PATCH 26/43] `--structured-outputs-config.reasoning_parser` -> `--reasoning-parser` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/reasoning_outputs.md | 10 +++++----- docs/features/structured_outputs.md | 2 +- docs/features/tool_calling.md | 2 +- ...openai_chat_completion_tool_calls_with_reasoning.py | 2 +- .../openai_chat_completion_with_reasoning.py | 2 +- .../openai_chat_completion_with_reasoning_streaming.py | 2 +- examples/online_serving/structured_outputs/README.md | 2 +- .../openai/test_chat_with_tool_reasoning.py | 6 +++--- tests/entrypoints/openai/test_cli_args.py | 4 ++-- .../openai/test_completion_with_function_calling.py | 4 ++-- tests/v1/entrypoints/openai/responses/conftest.py | 2 +- 11 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 329b174af49e..85681669dfb2 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -23,11 +23,11 @@ vLLM currently supports the following reasoning models: ## Quickstart -To use reasoning models, you need to specify the `--structured-outputs-config.reasoning_parser` flags when making a request to the chat completion endpoint. The `--structured-outputs-config.reasoning_parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. +To use reasoning models, you need to specify the `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --structured-outputs-config.reasoning_parser deepseek_r1 + --reasoning-parser deepseek_r1 ``` Next, make a request to the model that should return the reasoning content in the response. @@ -209,7 +209,7 @@ You can add a new `ReasoningParser` similar to --structured-outputs-config.reasoning_parser example +vllm serve --reasoning-parser example ``` diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index af23d7108975..1f955c6e30d6 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -161,7 +161,7 @@ See also: [full example](../examples/online_serving/structured_outputs.md) You can also use structured outputs with for reasoning models. ```bash -vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --structured-outputs-config.reasoning_parser deepseek_r1 +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1 ``` Note that you can use reasoning with any provided structured outputs feature. 
The following uses one with JSON schema: diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 01f9ad62908c..720102ff9ea3 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -308,7 +308,7 @@ Supported models: Flags: * For non-reasoning: `--tool-call-parser hunyuan_a13b` -* For reasoning: `--tool-call-parser hunyuan_a13b --structured-outputs-config.reasoning_parser hunyuan_a13b --enable_reasoning` +* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning` ### GLM-4.5 Models (`glm45`) diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py index cd3a7eb2b51f..4006d07f73b0 100644 --- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py @@ -10,7 +10,7 @@ ```bash vllm serve Qwen/QwQ-32B \ - --structured-outputs-config.reasoning_parser deepseek_r1 \ + --reasoning-parser deepseek_r1 \ --enable-auto-tool-choice --tool-call-parser hermes ``` diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index 8b934704442f..932dbeb2e7a2 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -9,7 +9,7 @@ ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --structured-outputs-config.reasoning_parser deepseek_r1 + --reasoning-parser deepseek_r1 ``` This example demonstrates how to generate chat completions from reasoning models diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index e952bf7ab0f4..7d1ea3771459 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -9,7 +9,7 @@ ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --structured-outputs-config.reasoning_parser deepseek_r1 + --reasoning-parser deepseek_r1 ``` Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the diff --git a/examples/online_serving/structured_outputs/README.md b/examples/online_serving/structured_outputs/README.md index bde4fa105808..d2777a43d478 100644 --- a/examples/online_serving/structured_outputs/README.md +++ b/examples/online_serving/structured_outputs/README.md @@ -14,7 +14,7 @@ To serve a reasoning model, you can use the following command: ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ - --structured-outputs-config.reasoning_parser deepseek_r1 + --reasoning-parser deepseek_r1 ``` If you want to run this script standalone with `uv`, you can use the following: diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py index 6d7067159894..03730b67283c 100644 --- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py +++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py @@ -14,9 +14,9 @@ @pytest.fixture(scope="module") def server(): # noqa: F811 args = [ - "--max-model-len", "8192", "--enforce-eager", - "--structured-outputs-config.reasoning_parser", "deepseek_r1", - "--enable-auto-tool-choice", "--tool-call-parser", "hermes" + "--max-model-len", "8192", 
"--enforce-eager", "--reasoning-parser", + "deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser", + "hermes" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index f22008f44d15..9a1c0ea13b54 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -145,7 +145,7 @@ def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser): """Ensure validation fails if reasoning is enabled with auto tool choice""" args = serve_parser.parse_args(args=[ "--enable-auto-tool-choice", - "--structured-outputs-config.reasoning_parser", + "--reasoning-parser", "deepseek_r1", ]) with pytest.raises(TypeError): @@ -156,7 +156,7 @@ def test_passes_with_reasoning_parser(serve_parser): """Ensure validation passes if reasoning is enabled with a reasoning parser""" args = serve_parser.parse_args(args=[ - "--structured-outputs-config.reasoning_parser", + "--reasoning-parser", "deepseek_r1", ]) validate_parsed_serve_args(args) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 8025f78e2c61..3649cefa9bf4 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -146,7 +146,7 @@ def server(): # noqa: F811 "xgrammar", "--tool-call-parser", "hermes", - "--structured-outputs-config.reasoning_parser", + "--reasoning-parser", "qwen3", "--gpu-memory-utilization", "0.4" @@ -229,7 +229,7 @@ def k2_server(): # noqa: F811 "xgrammar", "--tool-call-parser", "hermes", - "--structured-outputs-config.reasoning_parser", + "--reasoning-parser", "qwen3", "--gpu-memory-utilization", "0.4", diff --git a/tests/v1/entrypoints/openai/responses/conftest.py b/tests/v1/entrypoints/openai/responses/conftest.py index f33e590b7296..2d677a00b646 100644 --- a/tests/v1/entrypoints/openai/responses/conftest.py +++ b/tests/v1/entrypoints/openai/responses/conftest.py @@ -15,7 +15,7 @@ def default_server_args(): "--max-model-len", "8192", "--enforce-eager", # For faster startup. 
- "--structured-outputs-config.reasoning_parser", + "--reasoning-parser", "deepseek_r1", ] From ac75b2a6fb050984a03cfbd0362a46e6686109e5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:57:10 +0200 Subject: [PATCH 27/43] `InitVar` didn't work Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 126a1b74e3e0..012a917f975a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -8,7 +8,7 @@ import functools import json import sys -from dataclasses import MISSING, InitVar, dataclass, fields, is_dataclass +from dataclasses import MISSING, dataclass, fields, is_dataclass from itertools import permutations from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union, cast, get_args, @@ -416,7 +416,7 @@ class EngineArgs: structured_outputs_config: StructuredOutputsConfig = get_field( VllmConfig, "structured_outputs_config") - reasoning_parser: InitVar[str] = StructuredOutputsConfig.reasoning_parser + reasoning_parser: str = StructuredOutputsConfig.reasoning_parser # Deprecated guided decoding fields guided_decoding_backend: str = None guided_decoding_disable_fallback: bool = None @@ -477,8 +477,7 @@ class EngineArgs: kv_sharing_fast_prefill: bool = \ CacheConfig.kv_sharing_fast_prefill - def __post_init__(self, reasoning_parser: str): - self.structured_outputs_config.reasoning_parser = reasoning_parser + def __post_init__(self): # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object @@ -1417,6 +1416,11 @@ def create_engine_config( load_config = self.create_load_config() + # Pass reasoning_parser into StructuredOutputsConfig + if self.reasoning_parser: + self.structured_outputs_config.reasoning_parser = \ + self.reasoning_parser + # Forward the deprecated CLI args to the StructuredOutputsConfig so_config = self.structured_outputs_config if self.guided_decoding_backend is not None: From 292903653b5f46b75c119ed309138b56a3aef9e7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:57:43 +0200 Subject: [PATCH 28/43] `pre-commit` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 012a917f975a..5ff28dd4e248 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -418,10 +418,10 @@ class EngineArgs: VllmConfig, "structured_outputs_config") reasoning_parser: str = StructuredOutputsConfig.reasoning_parser # Deprecated guided decoding fields - guided_decoding_backend: str = None - guided_decoding_disable_fallback: bool = None - guided_decoding_disable_any_whitespace: bool = None - guided_decoding_disable_additional_properties: bool = None + guided_decoding_backend: Optional[str] = None + guided_decoding_disable_fallback: Optional[bool] = None + guided_decoding_disable_any_whitespace: Optional[bool] = None + guided_decoding_disable_additional_properties: Optional[bool] = None logits_processor_pattern: Optional[ str] = ModelConfig.logits_processor_pattern From 295ac17ec036e9c34b51e3baab205d8388e6a98e Mon Sep 17 00:00:00 2001 From: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:06:10 +0200 Subject: [PATCH 29/43] `sample_choices` -> `sample_structured_outputs_choices` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/conftest.py | 2 +- tests/entrypoints/openai/test_chat.py | 76 ++++++++++++++++--- tests/entrypoints/openai/test_completion.py | 7 +- tests/v1/entrypoints/conftest.py | 2 +- .../llm/test_struct_output_generate.py | 7 +- 5 files changed, 76 insertions(+), 18 deletions(-) diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index 88591b5eba09..30f2d67588fe 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -184,7 +184,7 @@ def sample_enum_json_schema(): @pytest.fixture -def sample_choices(): +def sample_structured_outputs_choices(): return [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", "Swift", "Kotlin" diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 35a896d0951e..08c5b37e683b 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -12,7 +12,7 @@ import regex as re import requests import torch -from openai import BadRequestError +from openai import BadRequestError, OpenAI from ...utils import RemoteOpenAIServer @@ -485,9 +485,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_structured_outputs_choice_chat(client: openai.AsyncOpenAI, - sample_choices, - is_v1_server: bool): +async def test_structured_outputs_choice_chat( + client: openai.AsyncOpenAI, sample_structured_outputs_choices, + is_v1_server: bool): if not is_v1_server: pytest.skip("Structured outputs is only supported in v1 engine") messages = [{ @@ -504,9 +504,10 @@ async def test_structured_outputs_choice_chat(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(structured_outputs={"choice": sample_choices})) + extra_body=dict( + structured_outputs={"choice": sample_structured_outputs_choices})) choice1 = chat_completion.choices[0].message.content - assert choice1 in sample_choices + assert choice1 in sample_structured_outputs_choices messages.append({"role": "assistant", "content": choice1}) messages.append({ @@ -518,9 +519,10 @@ async def test_structured_outputs_choice_chat(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(structured_outputs={"choice": sample_choices})) + extra_body=dict( + structured_outputs={"choice": sample_structured_outputs_choices})) choice2 = chat_completion.choices[0].message.content - assert choice2 in sample_choices + assert choice2 in sample_structured_outputs_choices assert choice1 != choice2 @@ -633,7 +635,7 @@ async def test_structured_outputs_type_error(client: openai.AsyncOpenAI): @pytest.mark.asyncio async def test_structured_outputs_choice_chat_logprobs( - client: openai.AsyncOpenAI, sample_choices): + client: openai.AsyncOpenAI, sample_structured_outputs_choices): messages = [{ "role": "system", @@ -650,7 +652,8 @@ async def test_structured_outputs_choice_chat_logprobs( max_completion_tokens=10, logprobs=True, top_logprobs=5, - extra_body=dict(structured_outputs={"choice": sample_choices})) + extra_body=dict( + structured_outputs={"choice": sample_structured_outputs_choices})) assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs.content is not None @@ 
-972,6 +975,59 @@ async def test_long_seed(client: openai.AsyncOpenAI): or "less_than_equal" in exc_info.value.message) +@pytest.mark.asyncio +async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer): + url = f"http://localhost:{server.port}/v1/chat/completions" + headers = { + "Content-Type": "application/json", + } + data = { + # model_name is avoided here. + "messages": [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "what is 1+1?" + }], + "max_tokens": + 5 + } + + response = requests.post(url, headers=headers, json=data) + response_data = response.json() + print(response_data) + assert response_data.get("model") == MODEL_NAME + choice = response_data.get("choices")[0] + message = choice.get("message") + assert message is not None + content = message.get("content") + assert content is not None + assert len(content) > 0 + + +@pytest.mark.asyncio +async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer): + openai_api_key = "EMPTY" + openai_api_base = f"http://localhost:{server.port}/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + messages = [ + { + "role": "user", + "content": "Hello, vLLM!" + }, + ] + response = client.chat.completions.create( + model="", # empty string + messages=messages, + ) + assert response.model == MODEL_NAME + + @pytest.mark.asyncio async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI): diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index e0fa8f2a9cce..aca88399e1f2 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -659,7 +659,7 @@ async def test_structured_outputs_regex_completion( @pytest.mark.asyncio async def test_structured_outputs_choice_completion( client: openai.AsyncOpenAI, - sample_choices, + sample_structured_outputs_choices, is_v1_server: bool, ): if not is_v1_server: @@ -671,12 +671,13 @@ async def test_structured_outputs_choice_completion( n=2, temperature=1.0, max_tokens=10, - extra_body=dict(structured_outputs=dict(choice=sample_choices))) + extra_body=dict(structured_outputs=dict( + choice=sample_structured_outputs_choices))) assert completion.id is not None assert len(completion.choices) == 2 for i in range(2): - assert completion.choices[i].text in sample_choices + assert completion.choices[i].text in sample_structured_outputs_choices @pytest.mark.asyncio diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py index 08d50e3fc928..46b953fe3743 100644 --- a/tests/v1/entrypoints/conftest.py +++ b/tests/v1/entrypoints/conftest.py @@ -151,7 +151,7 @@ def sample_definition_json_schema(): @pytest.fixture -def sample_choices(): +def sample_structured_outputs_choices(): return [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", "Swift", "Kotlin" diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index b0690815c2d1..4db4ba4fca83 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -100,7 +100,7 @@ def test_structured_output( sample_sql_ebnf: str, sample_sql_lark: str, sample_regex: str, - sample_choices: str, + sample_structured_outputs_choices: str, backend: str, tokenizer_mode: str, model_name: str, @@ -356,7 +356,8 @@ def test_structured_output( sampling_params = 
SamplingParams( temperature=0.8, top_p=0.95, - structured_outputs=StructuredOutputsParams(choice=sample_choices)) + structured_outputs=StructuredOutputsParams( + choice=sample_structured_outputs_choices)) outputs = llm.generate( ("The best language for type-safe systems programming is " @@ -372,7 +373,7 @@ def test_structured_output( generated_text = output.outputs[0].text print(generated_text) assert generated_text is not None - assert generated_text in sample_choices + assert generated_text in sample_structured_outputs_choices print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") # From 6954712590bd2fbbfdcc728c9785a27274f3edb8 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:08:26 +0200 Subject: [PATCH 30/43] Update mergify path Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/mergify.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 7448b2de94a3..94198b1251e0 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -171,6 +171,7 @@ pull_request_rules: - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files~=^tests/v1/structured_output/ + - files=tests/v1/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ actions: label: From 49b6d893bbb995335cd361868497fac1a34d9f77 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:10:38 +0200 Subject: [PATCH 31/43] Fix wrong kwarg Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 4db4ba4fca83..d0e0d4e77896 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -646,7 +646,7 @@ def test_structured_output_auto_mode( llm = LLM(model=model_name, max_model_len=1024, - structured_outputs=dict(backend="auto"), + structured_outputs_config=dict(backend="auto"), tokenizer_mode=tokenizer_mode) sampling_params = SamplingParams( From e36177256a76d5bcc8fceb58ec994d38ea1073b6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:15:28 +0200 Subject: [PATCH 32/43] Simplify dict -> config Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/llm.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 8ec37d183c20..19b3e902dc37 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -238,9 +238,12 @@ def __init__( compilation_config_instance = CompilationConfig( level=compilation_config) elif isinstance(compilation_config, dict): - predicate = lambda x: is_init_field(CompilationConfig, x[0]) compilation_config_instance = CompilationConfig( - **dict(filter(predicate, compilation_config.items()))) + **{ + k: v + for k, v in compilation_config.items() + if is_init_field(CompilationConfig, k) + }) else: compilation_config_instance = compilation_config else: @@ -248,13 +251,12 @@ def __init__( if structured_outputs_config is not None: if 
isinstance(structured_outputs_config, dict): - predicate = lambda x: is_init_field(StructuredOutputsConfig, x[ - 0]) - structured_outputs_instance = StructuredOutputsConfig(**dict( - filter( - predicate, - structured_outputs_config.items(), - ))) + structured_outputs_instance = StructuredOutputsConfig( + **{ + k: v + for k, v in structured_outputs_config.items() + if is_init_field(StructuredOutputsConfig, k) + }) else: structured_outputs_instance = structured_outputs_config else: From 94a2e74b0e7458c9ab315d7db59e7cdbce752061 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:20:11 +0200 Subject: [PATCH 33/43] Simplify request validation Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index c12aec4fe164..9c402340bf98 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -814,15 +814,9 @@ def check_structured_outputs_count(cls, data): return data structured_outputs_kwargs = data['structured_outputs'] - - count = sum([ - "json" in structured_outputs_kwargs - and structured_outputs_kwargs["json"] is not None, - "regex" in structured_outputs_kwargs - and structured_outputs_kwargs["regex"] is not None, - "choice" in structured_outputs_kwargs - and structured_outputs_kwargs["choice"] is not None - ]) + count = sum( + structured_outputs_kwargs.get(k) is not None + for k in ("json", "regex", "choice")) # you can only use one kind of constraints for structured outputs if count > 1: raise ValueError( @@ -1175,14 +1169,9 @@ def check_structured_outputs_count(cls, data): return data structured_outputs_kwargs = data['structured_outputs'] - count = sum([ - "json" in structured_outputs_kwargs - and structured_outputs_kwargs["json"] is not None, - "regex" in structured_outputs_kwargs - and structured_outputs_kwargs["regex"] is not None, - "choice" in structured_outputs_kwargs - and structured_outputs_kwargs["choice"] is not None - ]) + count = sum( + structured_outputs_kwargs.get(k) is not None + for k in ("json", "regex", "choice")) if count > 1: raise ValueError( "You can only use one kind of constraints for structured " From 02ef1a7e72c1fb5144cfb9fd3be8d249f78756ee Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 19:20:10 +0200 Subject: [PATCH 34/43] Small typo Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/tool_calling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 720102ff9ea3..2a48596571d1 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -83,7 +83,7 @@ For more advanced usage, including parallel tool calls and different model-speci ## Named Function Calling -vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backend supported by vLLM. You are guaranteed a validly-parsable function call - not a +vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a high-quality one. 
vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. From a2b0c18cd4bb1021c2cbe071e9541f44a7e0e8c2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 20:00:02 +0200 Subject: [PATCH 35/43] Fix type checking of `structured_outputs` in protocol Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 38 +++++++++++------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 9c402340bf98..f893e3449f6f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -519,7 +519,7 @@ class ChatCompletionRequest(OpenAIBaseModel): default=None, description=("Additional kwargs to pass to the HF processor."), ) - structured_outputs: Optional[dict[str, Any]] = Field( + structured_outputs: Optional[StructuredOutputsParams] = Field( default=None, description="Additional kwargs for structured outputs", ) @@ -640,28 +640,23 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - structured_outputs = None - if (self.structured_outputs is not None - and any(v is not None - for v in self.structured_outputs.values())): - structured_outputs = StructuredOutputsParams( - **self.structured_outputs) - + if self.structured_outputs is not None: if self.response_format is not None: if self.response_format.type == "json_object": - structured_outputs.json_object = True + self.structured_outputs.json_object = True elif self.response_format.type == "json_schema": json_schema = self.response_format.json_schema assert json_schema is not None - structured_outputs.json = json_schema.json_schema + self.structured_outputs.json = json_schema.json_schema elif self.response_format.type == "structural_tag": structural_tag = self.response_format assert structural_tag is not None and isinstance( structural_tag, StructuralTagResponseFormat) s_tag_obj = structural_tag.model_dump(by_alias=True) - structured_outputs.structural_tag = json.dumps(s_tag_obj) + self.structured_outputs.structural_tag = json.dumps( + s_tag_obj) if structured_outputs_json := self._get_json_schema_from_tool(): - structured_outputs.json = structured_outputs_json + self.structured_outputs.json = structured_outputs_json extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -693,9 +688,9 @@ def to_sampling_params( truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - structured_outputs=structured_outputs, + structured_outputs=self.structured_outputs, logit_bias=self.logit_bias, - bad_words= self.bad_words, + bad_words=self.bad_words, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, ) @@ -983,7 +978,7 @@ class CompletionRequest(OpenAIBaseModel): ", {'type': 'structural_tag'}, or {'type': 'text' } is supported." 
), ) - structured_outputs: Optional[dict[str, Any]] = Field( + structured_outputs: Optional[StructuredOutputsParams] = Field( default=None, description="Additional kwargs for structured outputs", ) @@ -1116,15 +1111,10 @@ def to_sampling_params( echo_without_generation = self.echo and self.max_tokens == 0 - structured_outputs = None if (self.structured_outputs is not None - and any(v is not None - for v in self.structured_outputs.values())): - structured_outputs = StructuredOutputsParams( - **self.structured_outputs) - if (self.response_format is not None - and self.response_format.type == "json_object"): - structured_outputs.json_object = True + and self.response_format is not None + and self.response_format.type == "json_object"): + self.structured_outputs.json_object = True extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -1156,7 +1146,7 @@ def to_sampling_params( truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - structured_outputs=structured_outputs, + structured_outputs=self.structured_outputs, logit_bias=self.logit_bias, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, From 176ecce61b5e10be62f356a41aef4fafe6ab1f73 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 22:41:31 +0200 Subject: [PATCH 36/43] Fix incorrect condition for enabling disable_any_whitespace in test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index d0e0d4e77896..abc7973aee13 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -583,7 +583,7 @@ def test_structured_output_with_reasoning_matrices( max_num_seqs=16, structured_outputs_config=dict(backend=backend, disable_any_whitespace=backend - not in {"xgrammar", "guidance"}, + in {"xgrammar", "guidance"}, reasoning_parser=reasoning_parser), tokenizer_mode=tokenizer_mode, speculative_config=speculative_config, @@ -763,7 +763,8 @@ def test_structured_output_batched_with_non_structured_outputs_requests( max_model_len=1024, structured_outputs_config=StructuredOutputsConfig( backend=backend, - disable_any_whitespace=(backend in {"xgrammar", "guidance"})), + disable_any_whitespace=backend in {"xgrammar", "guidance"}, + ), ) structured_outputs_prompt = ( From 553d7a5d3c642911ef2bb00322e8ea54ba44d004 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 23:30:44 +0200 Subject: [PATCH 37/43] Fix opinionated backend selection when `backend="auto"` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/sampling_params.py | 10 +++++++--- vllm/v1/engine/processor.py | 8 ++++++++ vllm/v1/structured_output/__init__.py | 3 ++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 0100c7ccc646..cac7f72a72d3 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -2,12 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sampling parameters for text generation.""" import copy -from dataclasses import dataclass +from 
dataclasses import field from enum import Enum, IntEnum from functools import cached_property from typing import Annotated, Any, Optional, Union import msgspec +from pydantic.dataclasses import dataclass from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor @@ -28,19 +29,22 @@ class SamplingType(IntEnum): # maybe make msgspec? @dataclass class StructuredOutputsParams: - """One of these fields will be used to build a logit processor.""" + # One of these fields will be used to build a logit processor. json: Optional[Union[str, dict]] = None regex: Optional[str] = None choice: Optional[list[str]] = None grammar: Optional[str] = None json_object: Optional[bool] = None - """These are other options that can be set""" + # These are other options that can be set. disable_fallback: bool = False disable_any_whitespace: bool = False disable_additional_properties: bool = False whitespace_pattern: Optional[str] = None structural_tag: Optional[str] = None + _backend: Optional[str] = field(default=None, init=False) + """CAUTION: Should only be set by Processor._validate_structured_output""" + def __post_init__(self): """Validate that some fields are mutually exclusive.""" count = sum([ diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 045470a81c0d..717a5ba64d37 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -216,6 +216,12 @@ def _validate_structured_output(self, params: SamplingParams) -> None: ) backend = self.structured_outputs_config.backend + if params.structured_outputs._backend and backend != "auto": + raise ValueError( + "StructuredOutputsParams._backend should only be set here if " + "StructuredOutputsConfig.backend is 'auto'.") + else: + params.structured_outputs._backend = backend # Request content validation if (isinstance(params.structured_outputs.choice, list) @@ -249,11 +255,13 @@ def _validate_structured_output(self, params: SamplingParams) -> None: # other setting where a specific backend was specified. try: validate_xgrammar_grammar(params) + params.structured_outputs._backend = "xgrammar" except ValueError: # The request either failed validation # or includes some jsonschema feature(s) that # are not supported in xgrammar. Fall back to guidance. validate_guidance_grammar(params, tokenizer=None) + params.structured_outputs._backend = "guidance" def _maybe_build_mm_uuids( self, diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 8ac5ea4129f7..d2c09e2a1f9d 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -83,9 +83,10 @@ def grammar_init(self, request: Request) -> None: # # NOTE: We only support a single backend. We do NOT support different # backends on a per-request basis in V1 (for now, anyway...). 
+ # _backend is set in Processor._validate_structured_output if self.backend is None: assert request.sampling_params is not None - backend = self.vllm_config.structured_outputs_config.backend + backend = request.sampling_params.structured_outputs._backend vocab_size = self.vllm_config.model_config.get_vocab_size() if backend == "xgrammar": self.backend = XgrammarBackend( From bd5ef9476b9cecc5c1303daa82aec1300eefc2bd Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Sep 2025 00:25:27 +0200 Subject: [PATCH 38/43] Remove badly merged change Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/openai/test_chat.py | 55 +-------------------------- 1 file changed, 1 insertion(+), 54 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 08c5b37e683b..04876d29becb 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -12,7 +12,7 @@ import regex as re import requests import torch -from openai import BadRequestError, OpenAI +from openai import BadRequestError from ...utils import RemoteOpenAIServer @@ -975,59 +975,6 @@ async def test_long_seed(client: openai.AsyncOpenAI): or "less_than_equal" in exc_info.value.message) -@pytest.mark.asyncio -async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer): - url = f"http://localhost:{server.port}/v1/chat/completions" - headers = { - "Content-Type": "application/json", - } - data = { - # model_name is avoided here. - "messages": [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "what is 1+1?" - }], - "max_tokens": - 5 - } - - response = requests.post(url, headers=headers, json=data) - response_data = response.json() - print(response_data) - assert response_data.get("model") == MODEL_NAME - choice = response_data.get("choices")[0] - message = choice.get("message") - assert message is not None - content = message.get("content") - assert content is not None - assert len(content) > 0 - - -@pytest.mark.asyncio -async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer): - openai_api_key = "EMPTY" - openai_api_base = f"http://localhost:{server.port}/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - messages = [ - { - "role": "user", - "content": "Hello, vLLM!" 
- }, - ] - response = client.chat.completions.create( - model="", # empty string - messages=messages, - ) - assert response.model == MODEL_NAME - - @pytest.mark.asyncio async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI): From 8b38bc4aeca6e4a8d1043c172ba074769656d374 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Sep 2025 00:43:27 +0200 Subject: [PATCH 39/43] Fix opinionated backend selection part 2 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/sampling_params.py | 2 ++ vllm/v1/engine/processor.py | 20 ++++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index cac7f72a72d3..0a01cb0260ae 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -44,6 +44,8 @@ class StructuredOutputsParams: _backend: Optional[str] = field(default=None, init=False) """CAUTION: Should only be set by Processor._validate_structured_output""" + _backend_was_auto: bool = field(default=False, init=False) + """CAUTION: Should only be set by Processor._validate_structured_output""" def __post_init__(self): """Validate that some fields are mutually exclusive.""" diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 717a5ba64d37..4766b3039f7d 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -216,10 +216,20 @@ def _validate_structured_output(self, params: SamplingParams) -> None: ) backend = self.structured_outputs_config.backend - if params.structured_outputs._backend and backend != "auto": - raise ValueError( - "StructuredOutputsParams._backend should only be set here if " - "StructuredOutputsConfig.backend is 'auto'.") + if _backend := params.structured_outputs._backend: + # Request-level backend selection is not supported. + # The values may differ if `params` is reused and was set + # to a specific backend based on `auto` behavior in a previous + # request. We remember that it was set as a result of `auto` + # using the `_auto` option set on the backend in the params. + if (backend != _backend + and not (backend == "auto" + and params.structured_outputs._backend_was_auto)): + raise ValueError( + "Request-level structured output backend selection is not " + f"supported. The request specified '{_backend}', but vLLM " + f"was initialised with '{backend}'. This error can be " + "resolved by removing '_backend' from the request.") else: params.structured_outputs._backend = backend @@ -262,6 +272,8 @@ def _validate_structured_output(self, params: SamplingParams) -> None: # are not supported in xgrammar. Fall back to guidance. 
validate_guidance_grammar(params, tokenizer=None) params.structured_outputs._backend = "guidance" + # Remember that this backend was set automatically + params.structured_outputs._backend_was_auto = True def _maybe_build_mm_uuids( self, From 76cb011f1446646e77e90ea5283d9e24099cebda Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Sep 2025 00:51:50 +0200 Subject: [PATCH 40/43] Fix comment Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/v1/engine/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 4766b3039f7d..9c25e043c2d1 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -221,7 +221,7 @@ def _validate_structured_output(self, params: SamplingParams) -> None: # The values may differ if `params` is reused and was set # to a specific backend based on `auto` behavior in a previous # request. We remember that it was set as a result of `auto` - # using the `_auto` option set on the backend in the params. + # using the `_backend_was_auto` field set in the params. if (backend != _backend and not (backend == "auto" and params.structured_outputs._backend_was_auto)): From f869f9caa8ded7761fb6208dd7ffa4f817588ae3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Sep 2025 13:39:12 +0200 Subject: [PATCH 41/43] Make failing test less flaky Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/openai/test_chat.py | 60 +++++++++++---------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index cbd3731096f0..a827f94cfbfe 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -670,10 +670,23 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, }, { "role": "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {sample_json_schema}" + "content": ("Give an example JSON for an employee " + "profile using the specified tool.") + }] + tools = [{ + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": sample_json_schema + } }] + tool_choice = { + "type": "function", + "function": { + "name": "dummy_function_name" + } + } # non-streaming @@ -681,20 +694,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice={ - "type": "function", - "function": { - "name": "dummy_function_name" - } - }, + tools=tools, + tool_choice=tool_choice, ) message = chat_completion.choices[0].message assert len(message.content) == 0 @@ -712,25 +713,12 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, # streaming - stream = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice={ - "type": "function", - "function": { - "name": 
"dummy_function_name" - } - }, - stream=True) + stream = await client.chat.completions.create(model=MODEL_NAME, + messages=messages, + max_completion_tokens=1000, + tools=tools, + tool_choice=tool_choice, + stream=True) output = [] finish_reason_count = 0 From ec94b4a5099aa97ff3afa84170cd9f5ae7e5b804 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Sep 2025 14:50:34 +0200 Subject: [PATCH 42/43] Fix structured output being enabled by response format and tool calling Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index d0506a1040fe..9c5e5766db02 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -640,23 +640,32 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - if self.structured_outputs is not None: - if self.response_format is not None: - if self.response_format.type == "json_object": + if ((response_format := self.response_format) is not None or + (tool_json := self._get_json_schema_from_tool()) is not None): + # If structured outputs wasn't already enabled, + # we must enable it for these features to work + if self.structured_outputs is None: + self.structured_outputs = StructuredOutputsParams() + + # Set structured output params for response format + if response_format is not None: + if response_format.type == "json_object": self.structured_outputs.json_object = True - elif self.response_format.type == "json_schema": - json_schema = self.response_format.json_schema + elif response_format.type == "json_schema": + json_schema = response_format.json_schema assert json_schema is not None self.structured_outputs.json = json_schema.json_schema - elif self.response_format.type == "structural_tag": - structural_tag = self.response_format + elif response_format.type == "structural_tag": + structural_tag = response_format assert structural_tag is not None and isinstance( structural_tag, StructuralTagResponseFormat) s_tag_obj = structural_tag.model_dump(by_alias=True) self.structured_outputs.structural_tag = json.dumps( s_tag_obj) - if structured_outputs_json := self._get_json_schema_from_tool(): - self.structured_outputs.json = structured_outputs_json + + # Set structured output params for tool calling + if tool_json is not None: + self.structured_outputs.json = tool_json extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: From 5872fe7e392eed1491073e0c569d61f54785e8a5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 08:45:20 +0200 Subject: [PATCH 43/43] Fix test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 2bea57dd653e..cff4a45fdc43 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -669,8 +669,9 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - if ((response_format := self.response_format) is not None or - (tool_json := self._get_json_schema_from_tool()) is not None): + response_format = 
self.response_format + json_schema_from_tool = self._get_json_schema_from_tool() + if response_format is not None or json_schema_from_tool is not None: # If structured outputs wasn't already enabled, # we must enable it for these features to work if self.structured_outputs is None: @@ -693,8 +694,8 @@ def to_sampling_params( s_tag_obj) # Set structured output params for tool calling - if tool_json is not None: - self.structured_outputs.json = tool_json + if json_schema_from_tool is not None: + self.structured_outputs.json = json_schema_from_tool extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params:
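# --- Editor's illustrative sketch (not part of the patch) ---
# With the protocol.py change above, a request that only sets
# `response_format` (or a named `tool_choice`) now has structured outputs
# enabled automatically in to_sampling_params; the client does not need to
# pass any structured-output parameters explicitly. The server URL and model
# name below are assumptions for illustration only.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    messages=[{"role": "user", "content": "Reply with a JSON object."}],
    # json_object mode alone is enough to trigger structured decoding now.
    response_format={"type": "json_object"},
)
print(completion.choices[0].message.content)
# --- end sketch ---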