From 69068cdd5b420ffb7e3926a6a4c6575efeea1b59 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Tue, 12 Aug 2025 20:43:22 -0400 Subject: [PATCH 01/43] chore: finalize cleanup from v0 Signed-off-by: Aaron Pham --- .../benchmark_serving_structured_output.py | 16 +- docs/features/reasoning_outputs.md | 12 +- docs/features/structured_outputs.md | 36 ++-- docs/features/tool_calling.md | 11 +- docs/serving/openai_compatible_server.md | 4 +- .../offline_inference/structured_outputs.py | 57 +++--- ...t_completion_client_with_tools_required.py | 2 +- .../structured_outputs/structured_outputs.py | 8 +- tests/async_engine/test_async_llm_engine.py | 1 - tests/entrypoints/conftest.py | 2 +- tests/entrypoints/llm/test_lazy_outlines.py | 82 -------- tests/entrypoints/openai/test_chat.py | 18 +- tests/entrypoints/openai/test_completion.py | 78 ++++--- tests/entrypoints/openai/test_serving_chat.py | 4 - tests/test_sampling_params.py | 84 -------- tests/tool_use/test_tool_choice_required.py | 11 +- tests/v1/core/test_scheduler.py | 6 +- tests/v1/engine/test_llm_engine.py | 4 +- tests/v1/entrypoints/conftest.py | 2 +- .../llm/test_struct_output_generate.py | 92 +++++---- vllm/config/__init__.py | 35 ++-- vllm/engine/arg_utils.py | 64 ++---- vllm/engine/async_llm_engine.py | 8 +- vllm/engine/llm_engine.py | 11 +- vllm/engine/multiprocessing/client.py | 6 +- vllm/engine/protocol.py | 7 +- vllm/entrypoints/llm.py | 19 +- vllm/entrypoints/openai/protocol.py | 192 +++++++----------- vllm/model_executor/models/config.py | 6 +- vllm/sampling_params.py | 28 +-- vllm/transformers_utils/tokenizers/mistral.py | 3 - vllm/v1/engine/async_llm.py | 3 - vllm/v1/engine/processor.py | 39 +--- vllm/v1/request.py | 6 +- vllm/v1/structured_output/__init__.py | 6 +- vllm/v1/structured_output/backend_guidance.py | 4 +- vllm/v1/structured_output/backend_outlines.py | 12 +- vllm/v1/structured_output/backend_xgrammar.py | 6 +- vllm/v1/structured_output/request.py | 2 +- 39 files changed, 357 insertions(+), 630 deletions(-) delete mode 100644 tests/entrypoints/llm/test_lazy_outlines.py delete mode 100644 tests/test_sampling_params.py diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index ca6843a72aa3..28821aa4ab73 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -696,11 +696,11 @@ def _eval_correctness_regex(expected, actual): return re.match(args.regex, actual) is not None def _eval_correctness(expected, actual): - if args.structure_type == "guided_json": + if args.structure_type == "json": return _eval_correctness_json(expected, actual) - elif args.structure_type == "guided_regex": + elif args.structure_type == "regex": return _eval_correctness_regex(expected, actual) - elif args.structure_type == "guided_choice": + elif args.structure_type == "choice": return _eval_correctness_choice(expected, actual) else: return None @@ -780,18 +780,18 @@ def main(args: argparse.Namespace): ) if args.dataset == "grammar": - args.structure_type = "guided_grammar" + args.structure_type = "grammar" elif args.dataset == "regex": - args.structure_type = "guided_regex" + args.structure_type = "regex" elif args.dataset == "choice": - args.structure_type = "guided_choice" + args.structure_type = "choice" else: - args.structure_type = "guided_json" + args.structure_type = "json" if args.no_structured_output: args.structured_output_ratio = 0 if args.save_results: - result_file_name = 
f"{args.structured_output_ratio}guided" + result_file_name = f"{args.structured_output_ratio}so" result_file_name += f"_{backend}" result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.model.split('/')[-1]}" diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 04b943efbbbb..3c66f4bd57df 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -1,3 +1,7 @@ +--- +title: reasoning_outputs +--- + # Reasoning Outputs vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions. @@ -10,11 +14,11 @@ vLLM currently supports the following reasoning models: | Model Series | Parser Name | Structured Output Support | Tool Calling | |--------------|-------------|------------------|-------------| -| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ | -| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ | +| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ | +| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ | | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | -| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ | -| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ | +| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ | +| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ | !!! note IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 8a934d406f38..c99a54197421 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla The following parameters are supported, which must be added as extra parameters: -- `guided_choice`: the output will be exactly one of the choices. -- `guided_regex`: the output will follow the regex pattern. -- `guided_json`: the output will follow the JSON schema. -- `guided_grammar`: the output will follow the context free grammar. +- `choice`: the output will be exactly one of the choices. +- `regex`: the output will follow the regex pattern. +- `json`: the output will follow the JSON schema. +- `grammar`: the output will follow the context free grammar. - `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text. You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page. Structured outputs are supported by default in the OpenAI-Compatible Server. 
You may choose to specify the backend to use by setting the -`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`, +`--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`, which will try to choose an appropriate backend based on the details of the request. You may also choose a specific backend, along with some options. A full set of options is available in the `vllm serve --help` text. -Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: +Now let´s see an example for each of the cases, starting with the `choice`, as it´s the easiest one: ??? code @@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic messages=[ {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} ], - extra_body={"guided_choice": ["positive", "negative"]}, + extra_body={"structured_outputs": {"choices": ["positive", "negative"]}}, ) print(completion.choices[0].message.content) ``` -The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template: +The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template: ??? code @@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", } ], - extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, + extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]}, ) print(completion.choices[0].message.content) ``` One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. -For this we can use the `guided_json` parameter in two different ways: +For this we can use the `json` parameter in two different ways: - Using directly a [JSON Schema](https://json-schema.org/) - Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option). -The next example shows how to use the `guided_json` parameter with a Pydantic model: +The next example shows how to use the `response_format` parameter with a Pydantic model: ??? code @@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo JSON schema and how the fields should be populated. This can improve the results notably in most cases. -Finally we have the `guided_grammar` option, which is probably the most +Finally we have the `grammar` option, which is probably the most difficult to use, but it´s really powerful. It allows us to define complete languages like SQL queries. It works by using a context free EBNF grammar. 
As an example, we can use to define a specific format of simplified SQL queries: @@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries: "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", } ], - extra_body={"guided_grammar": simplified_sql_grammar}, + extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}}, ) print(completion.choices[0].message.content) ``` @@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: int: top_p=0.95, n=n, seed=seed, - guided_decoding=GuidedDecodingParams( + structured_outputs=StructuredOutputsParams( regex="[0-9]+") if structured_outputs else None, ) for n in n_list ], n_list diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py index ffe061212466..08d50e3fc928 100644 --- a/tests/v1/entrypoints/conftest.py +++ b/tests/v1/entrypoints/conftest.py @@ -151,7 +151,7 @@ def sample_definition_json_schema(): @pytest.fixture -def sample_guided_choice(): +def sample_choices(): return [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", "Swift", "Kotlin" diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 8bddfb0b48a5..3e6fdc6ee3e5 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -18,7 +18,7 @@ from vllm.outputs import RequestOutput from vllm.platforms import current_platform from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager -from vllm.sampling_params import GuidedDecodingParams, SamplingParams +from vllm.sampling_params import SamplingParams, StructuredOutputsParams if TYPE_CHECKING: from vllm.config import TokenizerMode @@ -85,7 +85,7 @@ def _load_json(s: str, backend: str) -> str: @pytest.mark.skip_global_cleanup @pytest.mark.parametrize( - "model_name, guided_decoding_backend, tokenizer_mode, speculative_config", + "model_name, backend, tokenizer_mode, speculative_config", PARAMS_MODELS_BACKENDS_TOKENIZER_MODE) def test_structured_output( monkeypatch: pytest.MonkeyPatch, @@ -94,8 +94,8 @@ def test_structured_output( sample_sql_ebnf: str, sample_sql_lark: str, sample_regex: str, - sample_guided_choice: str, - guided_decoding_backend: str, + sample_choices: str, + backend: str, tokenizer_mode: str, model_name: str, speculative_config: dict[str, Any], @@ -110,15 +110,13 @@ def test_structured_output( enforce_eager = bool(not current_platform.is_tpu()) # Use a single LLM instance for several scenarios to # speed up the test suite. 
- llm = LLM( - model=model_name, - enforce_eager=enforce_eager, - max_model_len=1024, - guided_decoding_backend=guided_decoding_backend, - guided_decoding_disable_any_whitespace=(guided_decoding_backend - in {"xgrammar", "guidance"}), - tokenizer_mode=tokenizer_mode, - speculative_config=speculative_config) + llm = LLM(model=model_name, + enforce_eager=enforce_eager, + max_model_len=1024, + structured_outputs_config=dict( + disable_any_whitespace=backend in {"xgrammar", "guidance"}), + tokenizer_mode=tokenizer_mode, + speculative_config=speculative_config) # # Test 1: Generate JSON output based on a provided schema @@ -126,7 +124,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=sample_json_schema)) + structured_outputs=StructuredOutputsParams(json=sample_json_schema)) outputs = llm.generate(prompts=[ (f"Give an example JSON for an employee profile that fits this " f"schema. Make the response as short as possible. Schema: " @@ -152,12 +150,12 @@ def test_structured_output( # # Test 2: Generate JSON object without a schema # - if guided_decoding_backend != "outlines": + if backend != "outlines": sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, n=2, - guided_decoding=GuidedDecodingParams(json_object=True)) + structured_outputs=StructuredOutputsParams(json_object=True)) outputs = llm.generate(prompts=( "Generate a JSON object with curly braces for a person with " @@ -186,8 +184,9 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=unsupported_json_schema)) - if guided_decoding_backend.startswith("xgrammar"): + structured_outputs=StructuredOutputsParams( + json=unsupported_json_schema)) + if backend.startswith("xgrammar"): with pytest.raises(ValueError, match="The provided JSON schema contains features " "not supported by xgrammar."): @@ -217,7 +216,7 @@ def test_structured_output( parsed_json = json.loads(generated_text) assert isinstance(parsed_json, dict) - if guided_decoding_backend != "outlines": + if backend != "outlines": # # Test 4: Generate SQL statement using EBNF grammar # @@ -225,7 +224,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar=sample_sql_ebnf)) + structured_outputs=StructuredOutputsParams( + grammar=sample_sql_ebnf)) outputs = llm.generate( prompts=( "Generate a sql statement that selects col_1 from " @@ -259,7 +259,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar=sample_sql_lark)) + structured_outputs=StructuredOutputsParams( + grammar=sample_sql_lark)) outputs = llm.generate( prompts=( "Generate a sql statement that selects col_1 from " @@ -298,7 +299,8 @@ def test_structured_output( temperature=0.8, top_p=0.95, max_tokens=1000, - guided_decoding=GuidedDecodingParams(grammar="not a grammar")) + structured_outputs=StructuredOutputsParams( + grammar="not a grammar")) with pytest.raises(ValueError, match="Failed to convert the grammar "): llm.generate( prompts= @@ -315,7 +317,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.8, top_p=0.95, - guided_decoding=GuidedDecodingParams(regex=sample_regex)) + structured_outputs=StructuredOutputsParams(regex=sample_regex)) outputs = llm.generate( prompts=[ (f"Give an example IPv4 address with this regex: {sample_regex}. 
" @@ -342,7 +344,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.8, top_p=0.95, - guided_decoding=GuidedDecodingParams(choice=sample_guided_choice)) + structured_outputs=StructuredOutputsParams(choice=sample_choices)) outputs = llm.generate( prompts=("The best language for type-safe systems programming is " "(Make the response as short as possible.) "), @@ -356,7 +358,7 @@ def test_structured_output( generated_text = output.outputs[0].text print(generated_text) assert generated_text is not None - assert generated_text in sample_guided_choice + assert generated_text in sample_choices print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") # @@ -366,7 +368,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, - guided_decoding=GuidedDecodingParams(json=json_schema)) + structured_outputs=StructuredOutputsParams(json=json_schema)) outputs = llm.generate(prompts=( "Generate a JSON with the brand, model and car_type of the most " "iconic car from the 90's. Make the response as short as " @@ -408,7 +410,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=1.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams(json=json_schema)) + structured_outputs=StructuredOutputsParams(json=json_schema)) outputs = llm.generate( prompts=("Generate a description of a frog using 50 characters. " @@ -429,7 +431,7 @@ def test_structured_output( output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=json_schema) - if guided_decoding_backend != "outlines": + if backend != "outlines": # # Test 11: Generate structured output using structural_tag format # @@ -455,7 +457,7 @@ def test_structured_output( sampling_params = SamplingParams( temperature=0.0, max_tokens=4096, - guided_decoding=GuidedDecodingParams( + structured_outputs=StructuredOutputsParams( structural_tag=json.dumps(structural_tag_config))) prompt = """ @@ -532,7 +534,7 @@ def test_structured_output( @pytest.mark.skip_global_cleanup @pytest.mark.parametrize( - "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 + "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 [ ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto", "deepseek_r1", NGRAM_SPEC_CONFIG), @@ -541,7 +543,7 @@ def test_structured_output( ) def test_structured_output_with_reasoning_matrices( monkeypatch: pytest.MonkeyPatch, - guided_decoding_backend: str, + backend: str, tokenizer_mode: TokenizerMode, reasoning_parser: str, model_name: str, @@ -561,10 +563,10 @@ def test_structured_output_with_reasoning_matrices( enforce_eager=bool(not current_platform.is_tpu()), max_model_len=1024, max_num_seqs=16, - guided_decoding_backend=guided_decoding_backend, - guided_decoding_disable_any_whitespace=True, + backend=backend, + structured_outputs_config=dict(disable_any_whitespace=True, + reasoning_backend=reasoning_parser), tokenizer_mode=tokenizer_mode, - reasoning_parser=reasoning_parser, speculative_config=speculative_config, ) tokenizer = llm.get_tokenizer(None) @@ -588,7 +590,7 @@ def test_structured_output_with_reasoning_matrices( sampling_params = SamplingParams( temperature=0.1, max_tokens=8192, - guided_decoding=GuidedDecodingParams(json=reasoning_schema), + structured_outputs=StructuredOutputsParams(json=reasoning_schema), ) outputs = llm.generate( [reasoning_prompt], @@ -625,13 +627,14 @@ def test_structured_output_auto_mode( llm = 
LLM(model=model_name, max_model_len=1024, - guided_decoding_backend="auto", + backend="auto", tokenizer_mode=tokenizer_mode) sampling_params = SamplingParams( temperature=1.0, max_tokens=1000, - guided_decoding=GuidedDecodingParams(json=unsupported_json_schema)) + structured_outputs=StructuredOutputsParams( + json=unsupported_json_schema)) prompts = ( "Give an example JSON object for a grade " @@ -668,9 +671,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct", max_model_len=1024, - guided_decoding_backend="guidance", - guided_decoding_disable_any_whitespace=True, - guided_decoding_disable_additional_properties=True) + structured_outputs_config=dict( + disable_any_whitespace=True, + disable_additional_properties=True)) schema = { 'type': 'object', @@ -696,14 +699,15 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): "<|im_end|>\n<|im_start|>assistant\n") def generate_with_backend(backend): - guided_params = GuidedDecodingParams( + structured_outputs_params = StructuredOutputsParams( json=schema, backend=backend, disable_any_whitespace=True, disable_additional_properties=True) - sampling_params = SamplingParams(temperature=0, - max_tokens=256, - guided_decoding=guided_params) + sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + structured_outputs=structured_outputs_params) outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) assert outputs is not None diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index df4eb33f5d45..11c810386ee2 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -2566,24 +2566,24 @@ class PoolerConfig: ## for embeddings models normalize: Optional[bool] = None """ - Whether to normalize the embeddings outputs. + Whether to normalize the embeddings outputs. """ dimensions: Optional[int] = None """ - Reduce the dimensions of embeddings if model + Reduce the dimensions of embeddings if model support matryoshka representation. """ ## for classification models activation: Optional[bool] = None """ - Whether to apply activation function to the classification outputs. + Whether to apply activation function to the classification outputs. """ ## for reward models softmax: Optional[bool] = None """ - Whether to apply softmax to the reward outputs. + Whether to apply softmax to the reward outputs. """ step_tag_id: Optional[int] = None """ @@ -2946,26 +2946,26 @@ def get_served_model_name(model: str, return served_model_name -GuidedDecodingBackend = Literal["auto", "xgrammar", "guidance", "outlines"] +StructuredOutputsBackend = Literal["auto", "xgrammar", "guidance", "outlines"] @config @dataclass -class DecodingConfig: - """Dataclass which contains the decoding strategy of the engine.""" +class StructuredOutputsConfig: + """Dataclass which contains structured outputs config for the engine.""" - backend: GuidedDecodingBackend = "auto" - """Which engine will be used for guided decoding (JSON schema / regex etc) + backend: StructuredOutputsBackend = "auto" + """Which engine will be used for structured outputs (JSON schema / regex etc) by default. 
With "auto", we will make opinionated choices based on request contents and what the backend libraries currently support, so the behavior - is subject to change in each release.""" + is subject to change in each release.""" # noqa: E501 disable_fallback: bool = False """If `True`, vLLM will not fallback to a different backend on error.""" disable_any_whitespace: bool = False - """If `True`, the model will not generate any whitespace during guided - decoding. This is only supported for xgrammar and guidance backends.""" + """If `True`, the model will not generate any whitespace during structured + outputs. This is only supported for xgrammar and guidance backends.""" disable_additional_properties: bool = False """If `True`, the `guidance` backend will not use `additionalProperties` @@ -3262,8 +3262,9 @@ class VllmConfig: """LoRA configuration.""" speculative_config: Optional[SpeculativeConfig] = None """Speculative decoding configuration.""" - decoding_config: DecodingConfig = field(default_factory=DecodingConfig) - """Decoding configuration.""" + structured_outputs_config: StructuredOutputsConfig = field( + default_factory=StructuredOutputsConfig) + """Structured outputs configuration.""" observability_config: Optional[ObservabilityConfig] = None """Observability configuration.""" quant_config: Optional[QuantizationConfig] = None @@ -3354,8 +3355,8 @@ def compute_hash(self) -> str: vllm_factors.append(self.speculative_config.compute_hash()) else: vllm_factors.append("None") - if self.decoding_config: - vllm_factors.append(self.decoding_config.compute_hash()) + if self.structured_outputs_config: + vllm_factors.append(self.structured_outputs_config.compute_hash()) else: vllm_factors.append("None") if self.observability_config: @@ -3775,7 +3776,7 @@ def __str__(self): f"enforce_eager={self.model_config.enforce_eager}, " f"kv_cache_dtype={self.cache_config.cache_dtype}, " f"device_config={self.device_config.device}, " - f"decoding_config={self.decoding_config!r}, " + f"decoding_config={self.structured_outputs_config!r}, " f"observability_config={self.observability_config!r}, " f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d74db67bda0d..32911d29f55c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -23,21 +23,20 @@ import vllm.envs as envs from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig, ConfigFormat, ConfigType, ConvertOption, - DecodingConfig, DetailedTraceModules, Device, - DeviceConfig, DistributedExecutorBackend, - GuidedDecodingBackend, HfOverrides, KVEventsConfig, - KVTransferConfig, LoadConfig, LogprobsMode, - LoRAConfig, ModelConfig, ModelDType, ModelImpl, - MultiModalConfig, ObservabilityConfig, ParallelConfig, - PoolerConfig, PrefixCachingHashAlgo, RunnerOption, - SchedulerConfig, SchedulerPolicy, SpeculativeConfig, + DetailedTraceModules, Device, DeviceConfig, + DistributedExecutorBackend, HfOverrides, + KVEventsConfig, KVTransferConfig, LoadConfig, + LogprobsMode, LoRAConfig, ModelConfig, ModelDType, + ModelImpl, MultiModalConfig, ObservabilityConfig, + ParallelConfig, PoolerConfig, PrefixCachingHashAlgo, + RunnerOption, SchedulerConfig, SchedulerPolicy, + SpeculativeConfig, StructuredOutputsConfig, TaskOption, TokenizerMode, VllmConfig, get_attr_docs, get_field) from vllm.logger import init_logger from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins from 
vllm.ray.lazy_utils import is_ray_initialized -from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.config import is_interleaved from vllm.transformers_utils.utils import check_gguf_file @@ -382,12 +381,9 @@ class EngineArgs: disable_hybrid_kv_cache_manager: bool = ( SchedulerConfig.disable_hybrid_kv_cache_manager) - guided_decoding_backend: GuidedDecodingBackend = DecodingConfig.backend - guided_decoding_disable_fallback: bool = DecodingConfig.disable_fallback - guided_decoding_disable_any_whitespace: bool = \ - DecodingConfig.disable_any_whitespace - guided_decoding_disable_additional_properties: bool = \ - DecodingConfig.disable_additional_properties + structured_outputs_config: StructuredOutputsConfig = get_field( + VllmConfig, "structured_outputs_config") + logits_processor_pattern: Optional[ str] = ModelConfig.logits_processor_pattern @@ -426,7 +422,6 @@ class EngineArgs: additional_config: dict[str, Any] = \ get_field(VllmConfig, "additional_config") - reasoning_parser: str = DecodingConfig.reasoning_backend use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location @@ -567,29 +562,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: load_group.add_argument('--pt-load-map-location', **load_kwargs["pt_load_map_location"]) - # Guided decoding arguments - guided_decoding_kwargs = get_kwargs(DecodingConfig) - guided_decoding_group = parser.add_argument_group( - title="DecodingConfig", - description=DecodingConfig.__doc__, - ) - guided_decoding_group.add_argument("--guided-decoding-backend", - **guided_decoding_kwargs["backend"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-fallback", - **guided_decoding_kwargs["disable_fallback"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-any-whitespace", - **guided_decoding_kwargs["disable_any_whitespace"]) - guided_decoding_group.add_argument( - "--guided-decoding-disable-additional-properties", - **guided_decoding_kwargs["disable_additional_properties"]) - guided_decoding_group.add_argument( - "--reasoning-parser", - # This choices is a special case because it's not static - choices=list(ReasoningParserManager.reasoning_parsers), - **guided_decoding_kwargs["reasoning_backend"]) - # Parallel arguments parallel_kwargs = get_kwargs(ParallelConfig) parallel_group = parser.add_argument_group( @@ -840,6 +812,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **vllm_kwargs["compilation_config"]) vllm_group.add_argument("--additional-config", **vllm_kwargs["additional_config"]) + vllm_group.add_argument('--structured-outputs-config', + **vllm_kwargs["structured_outputs_config"]) # Other arguments parser.add_argument('--disable-log-stats', @@ -1328,14 +1302,8 @@ def create_engine_config( load_config = self.create_load_config() - decoding_config = DecodingConfig( - backend=self.guided_decoding_backend, - disable_fallback=self.guided_decoding_disable_fallback, - disable_any_whitespace=self.guided_decoding_disable_any_whitespace, - disable_additional_properties=\ - self.guided_decoding_disable_additional_properties, - reasoning_backend=self.reasoning_parser - ) + structured_outputs_config = StructuredOutputsConfig( + **self.structured_outputs) observability_config = ObservabilityConfig( show_hidden_metrics_for_version=( @@ -1353,7 +1321,7 @@ def create_engine_config( lora_config=lora_config, 
speculative_config=speculative_config, load_config=load_config, - decoding_config=decoding_config, + structured_outputs_config=structured_outputs_config, observability_config=observability_config, compilation_config=self.compilation_config, kv_transfer_config=self.kv_transfer_config, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1f962b008ee0..851962920abe 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -10,8 +10,8 @@ from weakref import ReferenceType import vllm.envs as envs -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VllmConfig) +from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, + SchedulerConfig, VllmConfig) from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout @@ -1063,10 +1063,6 @@ async def get_parallel_config(self) -> ParallelConfig: """Get the parallel configuration of the vLLM engine.""" return self.engine.get_parallel_config() - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - return self.engine.get_decoding_config() - async def get_scheduler_config(self) -> SchedulerConfig: """Get the scheduling configuration of the vLLM engine.""" return self.engine.get_scheduler_config() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3fc4f6445df2..f04ec035030d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -16,9 +16,8 @@ from typing_extensions import TypeVar import vllm.envs as envs -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, SchedulerConfig, - VllmConfig) +from vllm.config import (LoRAConfig, ModelConfig, ObservabilityConfig, + ParallelConfig, SchedulerConfig, VllmConfig) from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase, Stats @@ -217,8 +216,6 @@ def __init__( self.device_config = vllm_config.device_config self.speculative_config = vllm_config.speculative_config # noqa self.load_config = vllm_config.load_config - self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa - ) self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa ) @@ -814,10 +811,6 @@ def get_parallel_config(self) -> ParallelConfig: """Gets the parallel configuration.""" return self.parallel_config - def get_decoding_config(self) -> DecodingConfig: - """Gets the decoding configuration.""" - return self.decoding_config - def get_scheduler_config(self) -> SchedulerConfig: """Gets the scheduler configuration.""" return self.scheduler_config diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index f69f72edf6a5..7c3679507686 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -16,7 +16,7 @@ from zmq.asyncio import Socket from vllm import PoolingParams -from vllm.config import DecodingConfig, ModelConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs # yapf conflicts with isort for this block # yapf: disable @@ -93,7 +93,6 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig, # Get the configs. 
self.vllm_config = engine_config self.model_config = engine_config.model_config - self.decoding_config = engine_config.decoding_config if self.vllm_config.model_config.skip_tokenizer_init: self.tokenizer = None @@ -386,9 +385,6 @@ async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): async def get_vllm_config(self) -> VllmConfig: return self.vllm_config - async def get_decoding_config(self) -> DecodingConfig: - return self.decoding_config - async def get_model_config(self) -> ModelConfig: return self.model_config diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 671e9648a3d0..5984244dd9c0 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -6,7 +6,7 @@ from typing import AsyncGenerator, Mapping, Optional from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function -from vllm.config import DecodingConfig, ModelConfig, VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.parse import is_explicit_encoder_decoder_prompt @@ -247,11 +247,6 @@ async def get_model_config(self) -> ModelConfig: """Get the model configuration of the vLLM engine.""" ... - @abstractmethod - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - ... - @abstractmethod async def get_input_preprocessor(self) -> InputPreprocessor: """Get the input processor of the vLLM engine.""" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 915f14a29b90..a942532200c6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -17,8 +17,8 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, create_sort_beams_key_function) -from vllm.config import (CompilationConfig, ModelDType, TokenizerMode, - is_init_field) +from vllm.config import (CompilationConfig, ModelDType, + StructuredOutputsConfig, TokenizerMode, is_init_field) from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides, PoolerConfig, RunnerOption) from vllm.engine.llm_engine import LLMEngine @@ -196,6 +196,8 @@ def __init__( hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, override_pooler_config: Optional[PoolerConfig] = None, + structured_outputs_config: Optional[Union[dict[ + str, Any], StructuredOutputsConfig]] = None, compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None, **kwargs, @@ -245,6 +247,18 @@ def __init__( else: compilation_config_instance = CompilationConfig() + if structured_outputs_config is not None: + if isinstance(structured_outputs_config, dict): + predicate = lambda x: is_init_field(StructuredOutputsConfig, x[ + 0]) + structured_outputs_instance = StructuredOutputsConfig(**dict( + filter( + predicate, + structured_outputs_config.items(), + ))) + else: + structured_outputs_instance = structured_outputs_config + engine_args = EngineArgs( model=model, runner=runner, @@ -271,6 +285,7 @@ def __init__( hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, override_pooler_config=override_pooler_config, + structured_outputs_config=structured_outputs_instance, compilation_config=compilation_config_instance, **kwargs, ) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 543701ed144e..abae05386d8a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ 
-35,8 +35,8 @@ ScoreMultiModalParam) from vllm.logger import init_logger from vllm.pooling_params import PoolingParams -from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, - RequestOutputKind, SamplingParams) +from vllm.sampling_params import (BeamSearchParams, RequestOutputKind, + SamplingParams, StructuredOutputsParams) from vllm.sequence import Logprob from vllm.utils import random_uuid, resolve_obj_by_qualname @@ -335,11 +335,11 @@ def to_sampling_params( stop_token_ids = default_sampling_params.get("stop_token_ids") # Structured output - guided_decoding = None + structured_outputs = None if self.text is not None and self.text.format is not None: response_format = self.text.format if response_format.type == "json_schema": - guided_decoding = GuidedDecodingParams.from_optional( + structured_outputs = StructuredOutputsParams.from_optional( json=response_format.schema_) elif response_format.type == "json_object": raise NotImplementedError("json_object is not supported") @@ -353,7 +353,7 @@ def to_sampling_params( stop_token_ids=stop_token_ids, output_kind=(RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY), - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, ) @model_validator(mode="before") @@ -500,42 +500,9 @@ class ChatCompletionRequest(OpenAIBaseModel): default=None, description=("Additional kwargs to pass to the HF processor."), ) - guided_json: Optional[Union[str, dict, BaseModel]] = Field( + structured_outputs: Optional[dict[str, Any]] = Field( default=None, - description=("If specified, the output will follow the JSON schema."), - ) - guided_regex: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the regex pattern."), - ) - guided_choice: Optional[list[str]] = Field( - default=None, - description=( - "If specified, the output will be exactly one of the choices."), - ) - guided_grammar: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the context free grammar."), - ) - structural_tag: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the structural tag schema."), - ) - guided_decoding_backend: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default guided decoding backend " - "of the server for this specific request. 
If set, must be either " - "'outlines' / 'lm-format-enforcer'"), - ) - guided_whitespace_pattern: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default whitespace pattern " - "for guided json decoding."), + description="Additional kwargs for structured outputs", ) priority: int = Field( default=0, @@ -646,30 +613,29 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - guided_json_object = None + structured_outputs = StructuredOutputsParams(**self.structured_outputs) if self.response_format is not None: if self.response_format.type == "json_object": - guided_json_object = True + structured_outputs.json_object = True elif self.response_format.type == "json_schema": json_schema = self.response_format.json_schema assert json_schema is not None - self.guided_json = json_schema.json_schema + structured_outputs.json = json_schema.json_schema elif self.response_format.type == "structural_tag": structural_tag = self.response_format assert structural_tag is not None and isinstance( structural_tag, StructuralTagResponseFormat) s_tag_obj = structural_tag.model_dump(by_alias=True) - self.structural_tag = json.dumps(s_tag_obj) - - guided_decoding = GuidedDecodingParams.from_optional( - json=self._get_guided_json_from_tool() or self.guided_json, - regex=self.guided_regex, - choice=self.guided_choice, - grammar=self.guided_grammar, - json_object=guided_json_object, - backend=self.guided_decoding_backend, - whitespace_pattern=self.guided_whitespace_pattern, - structural_tag=self.structural_tag, + structured_outputs.structural_tag = json.dumps(s_tag_obj) + + structured_outputs = StructuredOutputsParams.from_optional( + json=self._get_json_schema_from_tool() or structured_outputs.json, + regex=structured_outputs.regex, + choice=structured_outputs.choice, + grammar=structured_outputs.grammar, + json_object=structured_outputs.json_object, + whitespace_pattern=structured_outputs.whitespace_pattern, + structural_tag=structured_outputs.structural_tag, ) extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} @@ -702,14 +668,14 @@ def to_sampling_params( truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, logit_bias=self.logit_bias, bad_words= self.bad_words, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, ) - def _get_guided_json_from_tool( + def _get_json_schema_from_tool( self) -> Optional[Union[str, dict, BaseModel]]: # user has chosen to not use any tool if self.tool_choice == "none" or self.tools is None: @@ -816,28 +782,37 @@ def check_logprobs(cls, data): @model_validator(mode="before") @classmethod - def check_guided_decoding_count(cls, data): + def check_structured_outputs_count(cls, data): if isinstance(data, ValueError): raise data - guide_count = sum([ - "guided_json" in data and data["guided_json"] is not None, - "guided_regex" in data and data["guided_regex"] is not None, - "guided_choice" in data and data["guided_choice"] is not None + if "structured_outputs" not in data: + return data + + structured_outputs_kwargs = data['structured_outputs'] + + count = sum([ + "json" in structured_outputs_kwargs + and structured_outputs_kwargs["json"] is not None, + "regex" in structured_outputs_kwargs + and structured_outputs_kwargs["regex"] is not None, + "choice" in structured_outputs_kwargs + and 
structured_outputs_kwargs["choice"] is not None ]) - # you can only use one kind of guided decoding - if guide_count > 1: + # you can only use one kind of constraints for structured outputs + if count > 1: raise ValueError( - "You can only use one kind of guided decoding " - "('guided_json', 'guided_regex' or 'guided_choice').") - # you can only either use guided decoding or tools, not both - if guide_count > 1 and data.get("tool_choice", "none") not in ( + "You can only use one kind of constraints for structured outputs ('json', 'regex' or 'choice')." # noqa: E501 + ) + # you can only either use structured outputs or tools, not both + if count > 1 and data.get("tool_choice", "none") not in ( "none", "auto", "required", ): raise ValueError( - "You can only either use guided decoding or tools, not both.") + "You can only either use constraints for structured outputs or tools, not both." # noqa: E501 + ) return data @model_validator(mode="before") @@ -990,37 +965,9 @@ class CompletionRequest(OpenAIBaseModel): ", {'type': 'structural_tag'}, or {'type': 'text' } is supported." ), ) - guided_json: Optional[Union[str, dict, BaseModel]] = Field( - default=None, - description="If specified, the output will follow the JSON schema.", - ) - guided_regex: Optional[str] = Field( - default=None, - description=( - "If specified, the output will follow the regex pattern."), - ) - guided_choice: Optional[list[str]] = Field( - default=None, - description=( - "If specified, the output will be exactly one of the choices."), - ) - guided_grammar: Optional[str] = Field( + structured_outputs: Optional[dict[str, Any]] = Field( default=None, - description=( - "If specified, the output will follow the context free grammar."), - ) - guided_decoding_backend: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default guided decoding backend " - "of the server for this specific request. 
If set, must be one of " - "'outlines' / 'lm-format-enforcer'"), - ) - guided_whitespace_pattern: Optional[str] = Field( - default=None, - description=( - "If specified, will override the default whitespace pattern " - "for guided json decoding."), + description="Additional kwargs for structured outputs", ) priority: int = Field( default=0, @@ -1143,19 +1090,19 @@ def to_sampling_params( echo_without_generation = self.echo and self.max_tokens == 0 - guided_json_object = None + structured_outputs_kwargs = StructuredOutputsParams( + **self.structured_outputs) if (self.response_format is not None and self.response_format.type == "json_object"): - guided_json_object = True - - guided_decoding = GuidedDecodingParams.from_optional( - json=self.guided_json, - regex=self.guided_regex, - choice=self.guided_choice, - grammar=self.guided_grammar, - json_object=guided_json_object, - backend=self.guided_decoding_backend, - whitespace_pattern=self.guided_whitespace_pattern, + structured_outputs_kwargs.json_object = True + + structured_outputs = StructuredOutputsParams.from_optional( + json=structured_outputs_kwargs.json, + regex=structured_outputs_kwargs.regex, + choice=structured_outputs_kwargs.choice, + grammar=structured_outputs_kwargs.grammar, + json_object=structured_outputs_kwargs.json_object, + whitespace_pattern=structured_outputs_kwargs.whitespace_pattern, ) extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} @@ -1188,7 +1135,7 @@ def to_sampling_params( truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, logit_bias=self.logit_bias, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, @@ -1196,16 +1143,23 @@ def to_sampling_params( @model_validator(mode="before") @classmethod - def check_guided_decoding_count(cls, data): - guide_count = sum([ - "guided_json" in data and data["guided_json"] is not None, - "guided_regex" in data and data["guided_regex"] is not None, - "guided_choice" in data and data["guided_choice"] is not None + def check_structured_outputs_count(cls, data): + if "structured_outputs" not in data: + return data + + structured_outputs_kwargs = data['structured_outputs'] + count = sum([ + "json" in structured_outputs_kwargs + and structured_outputs_kwargs["json"] is not None, + "regex" in structured_outputs_kwargs + and structured_outputs_kwargs["regex"] is not None, + "choice" in structured_outputs_kwargs + and structured_outputs_kwargs["choice"] is not None ]) - if guide_count > 1: + if count > 1: raise ValueError( - "You can only use one kind of guided decoding " - "('guided_json', 'guided_regex' or 'guided_choice').") + "You can only use one kind of constraints for structured outputs ('json', 'regex' or 'choice')." 
# noqa: E501 + ) return data @model_validator(mode="before") @@ -1991,7 +1945,7 @@ class DetokenizeResponse(OpenAIBaseModel): class TokenizerInfoResponse(OpenAIBaseModel): """ - Response containing tokenizer configuration + Response containing tokenizer configuration equivalent to tokenizer_config.json """ diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 6f21cd267b0e..4bfe9094edff 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -251,9 +251,9 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: - decoding_config = vllm_config.decoding_config - if decoding_config.reasoning_backend == "": - decoding_config.reasoning_backend = "GptOss" + structured_outputs_config = vllm_config.structured_outputs_config + if structured_outputs_config.reasoning_backend == "": + structured_outputs_config.reasoning_backend = "GptOss" # Increase the max capture size from 512 to 1024 for performance. # NOTE(woosuk): This will increase the number of CUDA graphs diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index df4cca9ba114..632bf05372a9 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -28,7 +28,7 @@ class SamplingType(IntEnum): # maybe make msgspec? @dataclass -class GuidedDecodingParams: +class StructuredOutputsParams: """One of these fields will be used to build a logit processor.""" json: Optional[Union[str, dict]] = None regex: Optional[str] = None @@ -36,8 +36,6 @@ class GuidedDecodingParams: grammar: Optional[str] = None json_object: Optional[bool] = None """These are other options that can be set""" - backend: Optional[str] = None - backend_was_auto: bool = False disable_fallback: bool = False disable_any_whitespace: bool = False disable_additional_properties: bool = False @@ -51,37 +49,35 @@ def from_optional( choice: Optional[list[str]] = None, grammar: Optional[str] = None, json_object: Optional[bool] = None, - backend: Optional[str] = None, whitespace_pattern: Optional[str] = None, structural_tag: Optional[str] = None, - ) -> Optional["GuidedDecodingParams"]: + ) -> Optional["StructuredOutputsParams"]: if all(arg is None for arg in (json, regex, choice, grammar, json_object, structural_tag)): return None # Extract json schemas from pydantic models if isinstance(json, (BaseModel, type(BaseModel))): json = json.model_json_schema() - return GuidedDecodingParams( + return StructuredOutputsParams( json=json, regex=regex, choice=choice, grammar=grammar, json_object=json_object, - backend=backend, whitespace_pattern=whitespace_pattern, structural_tag=structural_tag, ) def __post_init__(self): """Validate that some fields are mutually exclusive.""" - guide_count = sum([ + count = sum([ self.json is not None, self.regex is not None, self.choice is not None, self.grammar is not None, self.json_object is not None ]) - if guide_count > 1: + if count > 1: raise ValueError( - "You can only use one kind of guided decoding but multiple are " - f"specified: {self.__dict__}") + f"You can only use one kind of structured outputs constraint but multiple are specified: {self.__dict__}" # noqa: E501 + ) class RequestOutputKind(Enum): @@ -194,9 +190,7 @@ class SamplingParams( _all_stop_token_ids: set[int] = msgspec.field(default_factory=set) # Fields used to construct logits processors - guided_decoding: Optional[GuidedDecodingParams] = None - """If provided, the engine will construct a guided decoding logits - 
processor from these parameters.""" + structured_outputs: Optional[StructuredOutputsParams] = None logit_bias: Optional[dict[int, float]] = None """If provided, the engine will construct a logits processor that applies these logit biases.""" @@ -243,7 +237,7 @@ def from_optional( truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None, output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, - guided_decoding: Optional[GuidedDecodingParams] = None, + structured_outputs: Optional[StructuredOutputsParams] = None, logit_bias: Optional[Union[dict[int, float], dict[str, float]]] = None, allowed_token_ids: Optional[list[int]] = None, extra_args: Optional[dict[str, Any]] = None, @@ -285,7 +279,7 @@ def from_optional( logits_processors=logits_processors, truncate_prompt_tokens=truncate_prompt_tokens, output_kind=output_kind, - guided_decoding=guided_decoding, + structured_outputs=structured_outputs, logit_bias=logit_bias, allowed_token_ids=allowed_token_ids, extra_args=extra_args, @@ -552,7 +546,7 @@ def __repr__(self) -> str: "spaces_between_special_tokens=" f"{self.spaces_between_special_tokens}, " f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " - f"guided_decoding={self.guided_decoding}, " + f"structured_outputs={self.structured_outputs}, " f"extra_args={self.extra_args})") diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 6ccc636efaf1..99237fb96567 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -463,9 +463,6 @@ def _token_to_id(t: str): return decoded - # WARN: Outlines logits processors can overwrite this method. - # See: guided_decoding/outlines_logits_processors.py::_adapt_tokenizer - # for more. 
def decode(self, ids: Union[list[int], int], skip_special_tokens: bool = True) -> str: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a2706327914c..78629a13dc35 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -526,9 +526,6 @@ async def get_vllm_config(self) -> VllmConfig: async def get_model_config(self) -> ModelConfig: return self.model_config - async def get_decoding_config(self): - raise ValueError("Not Supported on V1 yet.") - async def get_input_preprocessor(self) -> InputPreprocessor: return self.processor.input_preprocessor diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index b9419142caf6..138773e41966 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -41,7 +41,7 @@ def __init__( self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config - self.decoding_config = vllm_config.decoding_config + self.structured_outputs_config = vllm_config.structured_outputs_config self.tokenizer = tokenizer self.generation_config_fields = ( @@ -154,40 +154,23 @@ def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: "not enabled!") def _validate_structured_output(self, params: SamplingParams) -> None: - if not params.guided_decoding or not self.decoding_config: + if not params.structured_outputs or not self.structured_outputs_config: return - if self.model_config.skip_tokenizer_init and params.guided_decoding: + if self.model_config.skip_tokenizer_init and params.structured_outputs: raise ValueError( "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 ) - engine_level_backend = self.decoding_config.backend - if params.guided_decoding.backend: - # Request-level backend selection is not supported in V1. - # The values may differ if `params` is reused and was set - # to a specific backend based on `auto` behavior in a previous - # request. We remember that it was set as a result of `auto` - # using the `_auto` option set on the backend in the params. - if (params.guided_decoding.backend != engine_level_backend - and not (engine_level_backend == "auto" - and params.guided_decoding.backend_was_auto)): - raise ValueError( - "Request-level structured output backend selection is no " - "longer supported. The request specified " - f"'{params.guided_decoding.backend}', but vLLM was " - f"initialised with '{engine_level_backend}'. This error " - "can be resolved by removing backend selection from the " - "request.") - else: - params.guided_decoding.backend = engine_level_backend + engine_level_backend = self.structured_outputs_config.backend # Request content validation - if (isinstance(params.guided_decoding.choice, list) - and not params.guided_decoding.choice): + if (isinstance(params.structured_outputs.choice, list) + and not params.structured_outputs.choice): # It is invalid for choice to be an empty list - raise ValueError(f"Choice '{params.guided_decoding.choice}' " - "cannot be an empty list") + raise ValueError( + f"Choice '{params.structured_outputs.choice}' cannot be an empty list" # noqa: E501 + ) if engine_level_backend.startswith("xgrammar"): # xgrammar with no fallback @@ -210,15 +193,11 @@ def _validate_structured_output(self, params: SamplingParams) -> None: # between releases as feature support changes. 
try: validate_xgrammar_grammar(params) - params.guided_decoding.backend = "xgrammar" except ValueError: # The request either failed validation # or includes some jsonschema feature(s) that # are not supported in xgrammar. Fall back to guidance. validate_guidance_grammar(params, tokenizer=None) - params.guided_decoding.backend = "guidance" - # Remember that this backend was set automatically - params.guided_decoding.backend_was_auto = True def process_inputs( self, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 85f5dcb92eb4..3f08c02bea24 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -50,7 +50,7 @@ def __init__( time.time() self.status = RequestStatus.WAITING - if sampling_params and sampling_params.guided_decoding is not None: + if sampling_params and sampling_params.structured_outputs is not None: self.status = RequestStatus.WAITING_FOR_FSM self.events: list[EngineCoreEvent] = [] self.stop_reason: Union[int, str, None] = None @@ -63,7 +63,7 @@ def __init__( elif sampling_params is not None: assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens - if sampling_params.guided_decoding is not None: + if sampling_params.structured_outputs is not None: self.status = RequestStatus.WAITING_FOR_FSM if sampling_params.extra_args is not None: @@ -175,7 +175,7 @@ def get_num_encoder_tokens(self, input_id: int) -> int: @property def use_structured_output(self) -> bool: return self.sampling_params is not None and \ - self.sampling_params.guided_decoding is not None + self.sampling_params.structured_outputs is not None def record_event( self, diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 63604a335d9f..4dccd1fe46bf 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -65,7 +65,7 @@ def __init__(self, vllm_config: VllmConfig): lora_config=self.vllm_config.lora_config, ).get_lora_tokenizer(None) reasoning_backend = \ - self.vllm_config.decoding_config.reasoning_backend + self.vllm_config.structured_outputs_config.reasoning_backend if reasoning_backend: reasoner_cls = ReasoningParserManager.get_reasoning_parser( reasoning_backend) @@ -77,7 +77,7 @@ def grammar_init(self, request: Request) -> None: if TYPE_CHECKING: assert request.sampling_params is not None and \ - request.sampling_params.guided_decoding is not None + request.sampling_params.structured_outputs is not None # Initialize the backend the first time it is needed. # @@ -85,7 +85,7 @@ def grammar_init(self, request: Request) -> None: # backends on a per-request basis in V1 (for now, anyway...). 
if self.backend is None: assert request.sampling_params is not None - backend = request.sampling_params.guided_decoding.backend + backend = self.vllm_config.structured_outputs_config.backend vocab_size = self.vllm_config.model_config.get_vocab_size() if backend == "xgrammar": self.backend = XgrammarBackend( diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 02e7fc33f517..e06ab6377de3 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -60,9 +60,9 @@ class GuidanceBackend(StructuredOutputBackend): def __post_init__(self): self.disable_any_whitespace = \ - self.vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.structured_outputs_config.disable_any_whitespace self.disable_additional_properties = \ - self.vllm_config.decoding_config.disable_additional_properties + self.vllm_config.structured_outputs_config.disable_additional_properties self.ll_tokenizer = llguidance_hf.from_tokenizer( self.tokenizer, self.vocab_size) diff --git a/vllm/v1/structured_output/backend_outlines.py b/vllm/v1/structured_output/backend_outlines.py index 572e4984480f..4ea859b305dc 100644 --- a/vllm/v1/structured_output/backend_outlines.py +++ b/vllm/v1/structured_output/backend_outlines.py @@ -158,10 +158,10 @@ def reset(self): def validate_structured_output_request_outlines(params: SamplingParams): - if params.guided_decoding is None: + if params.structured_outputs is None: return - gd_params = params.guided_decoding + gd_params = params.structured_outputs if gd_params.regex: validate_regex_is_buildable(gd_params.regex) @@ -178,7 +178,7 @@ def validate_structured_output_request_outlines(params: SamplingParams): schema = json.dumps(gd_params.json) except Exception as e: raise ValueError( - f"Error serializing guided decoding jsonschema: {e}" + f"Error serializing structured outputs jsonschema: {e}" ) from e pattern = json_schema.build_regex_from_schema(schema) validate_regex_is_buildable(pattern) @@ -187,7 +187,7 @@ def validate_structured_output_request_outlines(params: SamplingParams): regex = "(" + "|".join(choices) + ")" validate_regex_is_buildable(regex) elif gd_params.grammar: - raise ValueError("Outlines guided decoding backend " + raise ValueError("Outlines structured outputs backend " "does not support grammar specifications") @@ -306,7 +306,7 @@ def validate_regex_is_buildable(pattern: str) -> None: _check_unsupported(parsed) except ValueError as e: raise ValueError( - f"Regex uses unsupported feature for guided decoding: {e}. " + f"Regex uses unsupported feature for structured outputs: {e}. " "Only basic matching constructs are supported—lookarounds, " "backreferences, and unicode boundaries are not.") from e @@ -315,6 +315,6 @@ def validate_regex_is_buildable(pattern: str) -> None: "Regex does not have a anchored universal start state" "This means that the Regex uses anchors (^) or look-arounds " "in a way which requires context before any token is matched." - "Guided decoding needs regexes that can match without needing " + "structured outputs needs regexes that can match without needing " "that context. Try rewriting the pattern without using these " f"constructs. 
Pattern:\n{pattern}") diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 5e00f6380416..edea1fd5fc8e 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -34,7 +34,7 @@ class XgrammarBackend(StructuredOutputBackend): def __post_init__(self): self.disable_any_whitespace = \ - self.vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.structured_outputs_config.disable_any_whitespace if isinstance(self.tokenizer, MistralTokenizer): # NOTE: ideally, xgrammar should handle this accordingly. @@ -248,10 +248,10 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: Raises ValueError if the request is not supported. """ - if sampling_params.guided_decoding is None: + if sampling_params.structured_outputs is None: return - gd_params = sampling_params.guided_decoding + gd_params = sampling_params.structured_outputs if gd_params.regex: try: diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index fc365f12573f..99974ef46ecd 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -60,7 +60,7 @@ def structured_output_key(self) -> StructuredOutputKey: def get_structured_output_key( sampling_params: SamplingParams) -> StructuredOutputKey: - params = sampling_params.guided_decoding + params = sampling_params.structured_outputs assert params is not None, "params can't be None." if params.json is not None: if not isinstance(params.json, str): From 47ef968e5f21ccc4858840a02403f495985ac37e Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Tue, 12 Aug 2025 20:48:20 -0400 Subject: [PATCH 02/43] fix: remove unnecessary frontmatter Signed-off-by: Aaron Pham --- docs/features/reasoning_outputs.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 3c66f4bd57df..c0a1c784686b 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -1,7 +1,3 @@ ---- -title: reasoning_outputs ---- - # Reasoning Outputs vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
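Taken together, the hunks above rename the request-level parameter from `guided_decoding` to `structured_outputs` and move backend selection entirely into the engine-level `StructuredOutputsConfig`. A minimal offline sketch of how the renamed API is expected to be used after this series; the model name and JSON schema below are illustrative placeholders, not part of the patches:

```python
# Sketch only: demonstrates the renamed structured-outputs API after this
# patch series. Model name and schema are placeholder assumptions.
from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams

employee_schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
    "required": ["name", "age"],
}

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    # The backend is now chosen once at the engine level; per-request
    # backend selection is removed by the processor.py hunk above.
    structured_outputs_config={"backend": "auto"},
)

params = SamplingParams(
    max_tokens=256,
    # Previously spelled guided_decoding=GuidedDecodingParams(json=...).
    structured_outputs=StructuredOutputsParams(json=employee_schema),
)

outputs = llm.generate(["Give an example employee profile as JSON."], params)
print(outputs[0].outputs[0].text)
```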
From f5d594c919f9f8afe94b0242c04f5109bc87b1d5 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Tue, 12 Aug 2025 21:11:24 -0400 Subject: [PATCH 03/43] fix: tests to use correct CLI args Signed-off-by: Aaron Pham --- .../entrypoints/openai/test_completion_with_function_calling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index a5b081f86107..b2c3386b320b 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -21,7 +21,7 @@ def server(): # noqa: F811 "--dtype", "half", "--enable-auto-tool-choice", - "--guided-decoding-backend", + "--structured-outputs-config.backend", "xgrammar", "--tool-call-parser", "hermes", From bb884cdc8c9d041aca2b1f06a4a36a84ebfc4470 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:07:16 +0200 Subject: [PATCH 04/43] Sweep for `guided_{choice/regex/json/grammar}` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/structured_outputs.md | 2 +- docs/serving/openai_compatible_server.md | 2 +- tests/entrypoints/openai/test_chat.py | 28 ++++++++++--------- .../entrypoints/openai/test_openai_schema.py | 8 ++++-- .../openai/test_chat_completion.py | 14 ++++++++-- .../v1/entrypoints/openai/test_completion.py | 14 ++++++++-- 6 files changed, 44 insertions(+), 24 deletions(-) diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 60da6cc9a7d1..1f955c6e30d6 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -45,7 +45,7 @@ Now let´s see an example for each of the cases, starting with the `choice`, as messages=[ {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} ], - extra_body={"structured_outputs": {"choices": ["positive", "negative"]}}, + extra_body={"structured_outputs": {"choice": ["positive", "negative"]}}, ) print(completion.choices[0].message.content) ``` diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index ec4a1a7004a3..56eb3c515c86 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -133,7 +133,7 @@ completion = client.chat.completions.create( {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} ], extra_body={ - "structured_outputs": {"choices": ["positive", "negative"]} + "structured_outputs": {"choice": ["positive", "negative"]} } ) ``` diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 2ccb0beb7709..4cb1eda87e0f 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -505,7 +505,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, sample_choices, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_choices)) + extra_body=dict(structured_outputs={"choice": sample_choices})) choice1 = chat_completion.choices[0].message.content assert choice1 in sample_choices @@ -519,7 +519,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, sample_choices, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(guided_choice=sample_choices)) + extra_body=dict(structured_outputs={"choice": sample_choices})) choice2 = 
chat_completion.choices[0].message.content assert choice2 in sample_choices assert choice1 != choice2 @@ -545,7 +545,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema)) + extra_body=dict(structured_outputs={"json": sample_json_schema})) message = chat_completion.choices[0].message assert message.content is not None json1 = json.loads(message.content) @@ -562,7 +562,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - extra_body=dict(guided_json=sample_json_schema)) + extra_body=dict(structured_outputs={"json": sample_json_schema})) message = chat_completion.choices[0].message assert message.content is not None json2 = json.loads(message.content) @@ -590,7 +590,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, model=MODEL_NAME, messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex)) + extra_body=dict(structured_outputs={"regex": sample_regex})) ip1 = chat_completion.choices[0].message.content assert ip1 is not None assert re.fullmatch(sample_regex, ip1) is not None @@ -601,7 +601,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, model=MODEL_NAME, messages=messages, max_completion_tokens=20, - extra_body=dict(guided_regex=sample_regex)) + extra_body=dict(structured_outputs={"regex": sample_regex})) ip2 = chat_completion.choices[0].message.content assert ip2 is not None assert re.fullmatch(sample_regex, ip2) is not None @@ -621,12 +621,14 @@ async def test_structured_outputs_type_error(client: openai.AsyncOpenAI): }] with pytest.raises(openai.BadRequestError): - _ = await client.chat.completions.create(model=MODEL_NAME, - messages=messages, - extra_body=dict(guided_regex={ - 1: "Python", - 2: "C++" - })) + _ = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + extra_body=dict( + structured_outputs={"regex": { + 1: "Python", + 2: "C++" + }})) @pytest.mark.asyncio @@ -648,7 +650,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, max_completion_tokens=10, logprobs=True, top_logprobs=5, - extra_body=dict(guided_choice=sample_choices)) + extra_body=dict(structured_outputs={"choice": sample_choices})) assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs.content is not None diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index 11ed1c4a9ee4..3787c1001f9a 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -102,12 +102,14 @@ def no_invalid_types(case: schemathesis.models.Case): if "custom" in tool_call: return False - # Sometimes guided_grammar is generated to be empty + # Sometimes structured_outputs.grammar is generated to be empty # Causing a server error in EBNF grammar parsing # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421 - guided_grammar = case.body.get("guided_grammar") + structured_outputs = case.body.get("structured_outputs", {}) + g = structured_outputs.get("grammar") if isinstance( + structured_outputs, dict) else None - if guided_grammar == '': + if g == '': # Allow None (will be handled as no grammar) # But skip empty strings return False diff --git 
a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/v1/entrypoints/openai/test_chat_completion.py index dffb32846c05..9aa285aa9b18 100644 --- a/tests/v1/entrypoints/openai/test_chat_completion.py +++ b/tests/v1/entrypoints/openai/test_chat_completion.py @@ -77,7 +77,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI, "role": "user", "content": prompt, }], - extra_body={"guided_json": invalid_json_schema}, + extra_body={"structured_outputs": { + "json": invalid_json_schema + }}, ) @@ -99,7 +101,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str): "content": prompt, }], extra_body={ - "guided_regex": r"[.*", + "structured_outputs": { + "regex": r"[.*" + }, "stop": ["\n"] }, ) @@ -134,5 +138,9 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str): "role": "user", "content": prompt, }], - extra_body={"guided_grammar": invalid_simplified_sql_grammar}, + extra_body={ + "structured_outputs": { + "grammar": invalid_simplified_sql_grammar + } + }, ) diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 3a65583fab8d..afbda20a14c9 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -627,7 +627,9 @@ async def test_invalid_json_schema(client: openai.AsyncOpenAI, await client.completions.create( model=model_name, prompt=prompt, - extra_body={"guided_json": invalid_json_schema}, + extra_body={"structured_outputs": { + "json": invalid_json_schema + }}, ) @@ -646,7 +648,9 @@ async def test_invalid_regex(client: openai.AsyncOpenAI, model_name: str): model=model_name, prompt=prompt, extra_body={ - "guided_regex": r"[.*", + "structured_outputs": { + "regex": r"[.*" + }, "stop": ["\n"] }, ) @@ -678,7 +682,11 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str): await client.completions.create( model=model_name, prompt=prompt, - extra_body={"guided_grammar": invalid_simplified_sql_grammar}, + extra_body={ + "structured_outputs": { + "grammar": invalid_simplified_sql_grammar + } + }, ) From f2cd9e09fea9ebe728c56de1d9d4dedd36d75dea Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:08:34 +0200 Subject: [PATCH 05/43] `gd_params` -> `so_params` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../backend_lm_format_enforcer.py | 16 ++++----- vllm/v1/structured_output/backend_outlines.py | 22 ++++++------ vllm/v1/structured_output/backend_xgrammar.py | 34 +++++++++---------- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/vllm/v1/structured_output/backend_lm_format_enforcer.py b/vllm/v1/structured_output/backend_lm_format_enforcer.py index 2279a1c8c8a0..dbc2a59332ef 100644 --- a/vllm/v1/structured_output/backend_lm_format_enforcer.py +++ b/vllm/v1/structured_output/backend_lm_format_enforcer.py @@ -141,27 +141,27 @@ def validate_structured_output_request_lm_format_enforcer( if params.guided_decoding is None: return - gd_params = params.guided_decoding + so_params = params.guided_decoding - if gd_params.regex: + if so_params.regex: return - elif gd_params.json: - if isinstance(gd_params.json, str): + elif so_params.json: + if isinstance(so_params.json, str): try: # make sure schema is valid json - json.loads(gd_params.json) + json.loads(so_params.json) except json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: try: - json.dumps(gd_params.json) + 
json.dumps(so_params.json) except Exception as e: raise ValueError( f"Error serializing guided decoding jsonschema: {e}" ) from e return - elif gd_params.choice: + elif so_params.choice: return - elif gd_params.grammar: + elif so_params.grammar: raise ValueError("LM Format Enforcer guided decoding backend " "does not support grammar specifications") diff --git a/vllm/v1/structured_output/backend_outlines.py b/vllm/v1/structured_output/backend_outlines.py index 4ea859b305dc..e5e638a6ad76 100644 --- a/vllm/v1/structured_output/backend_outlines.py +++ b/vllm/v1/structured_output/backend_outlines.py @@ -161,32 +161,32 @@ def validate_structured_output_request_outlines(params: SamplingParams): if params.structured_outputs is None: return - gd_params = params.structured_outputs + so_params = params.structured_outputs - if gd_params.regex: - validate_regex_is_buildable(gd_params.regex) - elif gd_params.json: - if isinstance(gd_params.json, str): + if so_params.regex: + validate_regex_is_buildable(so_params.regex) + elif so_params.json: + if isinstance(so_params.json, str): try: # make sure schema is valid json - json.loads(gd_params.json) - schema = gd_params.json + json.loads(so_params.json) + schema = so_params.json except json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: try: - schema = json.dumps(gd_params.json) + schema = json.dumps(so_params.json) except Exception as e: raise ValueError( f"Error serializing structured outputs jsonschema: {e}" ) from e pattern = json_schema.build_regex_from_schema(schema) validate_regex_is_buildable(pattern) - elif gd_params.choice: - choices = [regex_escape(str(choice)) for choice in gd_params.choice] + elif so_params.choice: + choices = [regex_escape(str(choice)) for choice in so_params.choice] regex = "(" + "|".join(choices) + ")" validate_regex_is_buildable(regex) - elif gd_params.grammar: + elif so_params.grammar: raise ValueError("Outlines structured outputs backend " "does not support grammar specifications") diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index edea1fd5fc8e..55b4792fe010 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -251,34 +251,34 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: if sampling_params.structured_outputs is None: return - gd_params = sampling_params.structured_outputs + so_params = sampling_params.structured_outputs - if gd_params.regex: + if so_params.regex: try: - xgr.Grammar.from_regex(gd_params.regex) + xgr.Grammar.from_regex(so_params.regex) except Exception as err: raise ValueError("Failed to transform regex into a grammar: " f"{err}") from err - if gd_params.choice: - choice_grammar = choice_as_grammar(gd_params.choice) + if so_params.choice: + choice_grammar = choice_as_grammar(so_params.choice) try: xgr.Grammar.from_ebnf(choice_grammar) except Exception as err: raise ValueError("Failed to transform choices into a grammar: " "{err}") from err - gd_params.choice = None - gd_params.grammar = choice_grammar + so_params.choice = None + so_params.grammar = choice_grammar return - if gd_params.json: - if isinstance(gd_params.json, str): + if so_params.json: + if isinstance(so_params.json, str): try: - schema = json.loads(gd_params.json) + schema = json.loads(so_params.json) except json.JSONDecodeError as e: raise ValueError("Invalid JSON grammar specification.") from e else: - schema = gd_params.json + schema = 
so_params.json try: xgr.Grammar.from_json_schema(schema) @@ -291,11 +291,11 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: "supported by xgrammar.") return - if gd_params.grammar: - if grammar_is_likely_lark(gd_params.grammar): + if so_params.grammar: + if grammar_is_likely_lark(so_params.grammar): # xgrammar supports EBNF grammars only try: - gd_params.grammar = convert_lark_to_ebnf(gd_params.grammar) + so_params.grammar = convert_lark_to_ebnf(so_params.grammar) except ValueError as e: raise ValueError( "Failed to convert the grammar from Lark to EBNF. ") from e @@ -303,14 +303,14 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: # Test parsing EBNF grammar, possibly already converted from Lark try: # parse the grammar, but we aren't compiling it. - xgr.Grammar.from_ebnf(gd_params.grammar) + xgr.Grammar.from_ebnf(so_params.grammar) except Exception as e: raise ValueError("Invalid grammar specification.") from e return - if gd_params.structural_tag: + if so_params.structural_tag: try: - s_tag = json.loads(gd_params.structural_tag) + s_tag = json.loads(so_params.structural_tag) tags = [ xgr.StructuralTagItem( begin=s["begin"], From 8f37583a8780346cc87a37015940ccfc9d8b9f43 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:11:17 +0200 Subject: [PATCH 06/43] `g` -> `grammar` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/openai/test_openai_schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index 3787c1001f9a..73f79ac28d11 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -106,10 +106,10 @@ def no_invalid_types(case: schemathesis.models.Case): # Causing a server error in EBNF grammar parsing # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421 structured_outputs = case.body.get("structured_outputs", {}) - g = structured_outputs.get("grammar") if isinstance( + grammar = structured_outputs.get("grammar") if isinstance( structured_outputs, dict) else None - if g == '': + if grammar == '': # Allow None (will be handled as no grammar) # But skip empty strings return False From 6e972b81ba1985a0d64a99539e6c01b4972aadf7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:16:13 +0200 Subject: [PATCH 07/43] Fix `config/__init__.py` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 4fae2e2ef3a7..f89e0985bc8a 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3064,10 +3064,10 @@ class StructuredOutputsConfig: """Dataclass which contains structured outputs config for the engine.""" backend: StructuredOutputsBackend = "auto" - """Which engine will be used for structured outputs (JSON schema / regex etc) - by default. With "auto", we will make opinionated choices based on request - contents and what the backend libraries currently support, so the behavior - is subject to change in each release.""" # noqa: E501 + """Which engine will be used for structured outputs (e.g. JSON schema, + regex, etc) by default. 
With "auto", we will make opinionated choices + based on request contents and what the backend libraries currently support, + so the behavior is subject to change in each release.""" disable_fallback: bool = False """If `True`, vLLM will not fallback to a different backend on error.""" @@ -3915,7 +3915,7 @@ def __str__(self): f"enforce_eager={self.model_config.enforce_eager}, " f"kv_cache_dtype={self.cache_config.cache_dtype}, " f"device_config={self.device_config.device}, " - f"decoding_config={self.structured_outputs_config!r}, " + f"structured_outputs_config={self.structured_outputs_config!r}, " f"observability_config={self.observability_config!r}, " f"seed={self.model_config.seed}, " f"served_model_name={self.model_config.served_model_name}, " From 8810d842c750540912e3af1a226221e4a8cde7e0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:19:25 +0200 Subject: [PATCH 08/43] `engine_level_backend` -> `backend` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/v1/engine/processor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index ace04f17cc91..5906cd601591 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -207,7 +207,7 @@ def _validate_structured_output(self, params: SamplingParams) -> None: "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 ) - engine_level_backend = self.structured_outputs_config.backend + backend = self.structured_outputs_config.backend # Request content validation if (isinstance(params.structured_outputs.choice, list) @@ -217,23 +217,23 @@ def _validate_structured_output(self, params: SamplingParams) -> None: f"Choice '{params.structured_outputs.choice}' cannot be an empty list" # noqa: E501 ) - if engine_level_backend.startswith("xgrammar"): + if backend.startswith("xgrammar"): # xgrammar with no fallback validate_xgrammar_grammar(params) - elif engine_level_backend.startswith("guidance"): + elif backend.startswith("guidance"): # TODO: ideally we would have the LLTokenizer here as Lark syntax # allows <|special_token|> and similar, see # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens # Without tokenizer these are disallowed in grammars. validate_guidance_grammar(params, tokenizer=None) - elif engine_level_backend == "outlines": + elif backend == "outlines": # outlines backend validate_structured_output_request_outlines(params) - elif engine_level_backend == "lm-format-enforcer": + elif backend == "lm-format-enforcer": # lm format enforcer backend validate_structured_output_request_lm_format_enforcer(params) else: - # NOTE: engine_level_backend must be "auto" here, because we have + # NOTE: backend must be "auto" here, because we have # checked supported_backends above. # "auto" is an opt-in to opinionated behavior where we try to # choose a backend based on request contents. 
This is not the From 03962b2cefb6be39cda8cd2ff6a2d5159d3f7de5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:20:41 +0200 Subject: [PATCH 09/43] `guided_decoding` -> `structured_outputs` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/v1/structured_output/backend_lm_format_enforcer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/structured_output/backend_lm_format_enforcer.py b/vllm/v1/structured_output/backend_lm_format_enforcer.py index dbc2a59332ef..5f9925d209fe 100644 --- a/vllm/v1/structured_output/backend_lm_format_enforcer.py +++ b/vllm/v1/structured_output/backend_lm_format_enforcer.py @@ -138,10 +138,10 @@ def destroy(self): def validate_structured_output_request_lm_format_enforcer( params: SamplingParams): - if params.guided_decoding is None: + if params.structured_outputs is None: return - so_params = params.guided_decoding + so_params = params.structured_outputs if so_params.regex: return From ea8673513b5ac838f1d28a1d389a8a124718847e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:32:33 +0200 Subject: [PATCH 10/43] Fix `protocol.py` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index b7b617dff503..b3f49a2c0517 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -639,7 +639,8 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - structured_outputs = StructuredOutputsParams(**self.structured_outputs) + structured_outputs = StructuredOutputsParams( + **(self.structured_outputs or {})) if self.response_format is not None: if self.response_format.type == "json_object": structured_outputs.json_object = True @@ -828,8 +829,8 @@ def check_structured_outputs_count(cls, data): # you can only use one kind of constraints for structured outputs if count > 1: raise ValueError( - "You can only use one kind of constraints for structured outputs ('json', 'regex' or 'choice')." # noqa: E501 - ) + "You can only use one kind of constraints for structured " + "outputs ('json', 'regex' or 'choice').") # you can only either use structured outputs or tools, not both if count > 1 and data.get("tool_choice", "none") not in ( "none", @@ -837,8 +838,8 @@ def check_structured_outputs_count(cls, data): "required", ): raise ValueError( - "You can only either use constraints for structured outputs or tools, not both." # noqa: E501 - ) + "You can only either use constraints for structured outputs " + "or tools, not both.") return data @model_validator(mode="before") @@ -1125,7 +1126,7 @@ def to_sampling_params( echo_without_generation = self.echo and self.max_tokens == 0 structured_outputs_kwargs = StructuredOutputsParams( - **self.structured_outputs) + **(self.structured_outputs or {})) if (self.response_format is not None and self.response_format.type == "json_object"): structured_outputs_kwargs.json_object = True @@ -1192,8 +1193,8 @@ def check_structured_outputs_count(cls, data): ]) if count > 1: raise ValueError( - "You can only use one kind of constraints for structured outputs ('json', 'regex' or 'choice')." 
# noqa: E501 - ) + "You can only use one kind of constraints for structured " + "outputs ('json', 'regex' or 'choice').") return data @model_validator(mode="before") From fc0ce57f3c166d3b04974e97b894347f20bd574f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 1 Sep 2025 17:34:53 +0200 Subject: [PATCH 11/43] Missing docstring Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/sampling_params.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index e4a79572390a..398eb93d3ec3 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -76,8 +76,8 @@ def __post_init__(self): ]) if count > 1: raise ValueError( - f"You can only use one kind of structured outputs constraint but multiple are specified: {self.__dict__}" # noqa: E501 - ) + "You can only use one kind of structured outputs constraint " + f"but multiple are specified: {self.__dict__}") class RequestOutputKind(Enum): @@ -192,6 +192,7 @@ class SamplingParams( # Fields used to construct logits processors structured_outputs: Optional[StructuredOutputsParams] = None + """Parameters for configuring structured outputs.""" logit_bias: Optional[dict[int, float]] = None """If provided, the engine will construct a logits processor that applies these logit biases.""" From 36772c62e2c91e9dc7383f4ebe06231a06c508ec Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:29:14 +0200 Subject: [PATCH 12/43] Add missing backend selection Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 01fb5eb635ab..db5684ffaa3f 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -119,8 +119,9 @@ def test_structured_output( llm = LLM(model=model_name, enforce_eager=enforce_eager, max_model_len=1024, - structured_outputs_config=dict( - disable_any_whitespace=backend in {"xgrammar", "guidance"}), + structured_outputs_config=dict(backend=backend, + disable_any_whitespace=backend + in {"xgrammar", "guidance"}), tokenizer_mode=tokenizer_mode, speculative_config=speculative_config) From 6ac63c65628d858e3efa8b8d35aadfe57dd5bbc8 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:34:57 +0200 Subject: [PATCH 13/43] Remove last references to `guided_` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/scripts/hardware_ci/run-amd-test.sh | 6 ------ .github/mergify.yml | 1 - tests/entrypoints/openai/test_chat.py | 18 ++++++++++-------- .../llm/test_struct_output_generate.py | 2 +- 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index c395011a2448..7f90181048d0 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -167,12 +167,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then --ignore=entrypoints/llm/test_prompt_validation.py "} fi -#Obsolete currently -##ignore certain Entrypoints/llm tests -#if [[ $commands == *" && pytest -v -s 
entrypoints/llm/test_guided_generate.py"* ]]; then -# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} -#fi - # --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_oot_registration.py diff --git a/.github/mergify.yml b/.github/mergify.yml index 495d207d4426..cc27947a4f0e 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -164,7 +164,6 @@ pull_request_rules: - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files~=^tests/v1/structured_output/ - - files=tests/v1/entrypoints/llm/test_guided_generate.py - files~=^vllm/v1/structured_output/ actions: label: diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 4cb1eda87e0f..a01263b94955 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -487,8 +487,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_guided_choice_chat(client: openai.AsyncOpenAI, sample_choices, - is_v1_server: bool): +async def test_structured_outputs_choice_chat(client: openai.AsyncOpenAI, + sample_choices, + is_v1_server: bool): if not is_v1_server: pytest.skip("Guided decoding is only supported in v1 engine") messages = [{ @@ -526,8 +527,9 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, sample_choices, @pytest.mark.asyncio -async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, - is_v1_server: bool): +async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, + sample_json_schema, + is_v1_server: bool): if not is_v1_server: pytest.skip("Guided decoding is only supported in v1 engine") @@ -572,8 +574,8 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, @pytest.mark.asyncio -async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, - is_v1_server: bool): +async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI, + sample_regex, is_v1_server: bool): if not is_v1_server: pytest.skip("Guided decoding is only supported in v1 engine") @@ -632,8 +634,8 @@ async def test_structured_outputs_type_error(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, - sample_choices): +async def test_structured_outputs_choice_chat_logprobs( + client: openai.AsyncOpenAI, sample_choices): messages = [{ "role": "system", diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index db5684ffaa3f..69ce5c008c82 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -742,7 +742,7 @@ def generate_with_backend(backend): @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"]) -def test_structured_output_batched_with_non_guided_requests( +def test_structured_output_batched_with_non_structured_outputs_requests( monkeypatch: pytest.MonkeyPatch, sample_json_schema: dict[str, Any], backend: str, From 17c574daf779e28d47cf808b9510df31c4cbd0a5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:41:04 +0200 Subject: [PATCH 14/43] Fix arg utils Signed-off-by: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 11034d700113..486013dabe98 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1341,9 +1341,6 @@ def create_engine_config( load_config = self.create_load_config() - structured_outputs_config = StructuredOutputsConfig( - **self.structured_outputs) - observability_config = ObservabilityConfig( show_hidden_metrics_for_version=( self.show_hidden_metrics_for_version), @@ -1360,7 +1357,7 @@ def create_engine_config( lora_config=lora_config, speculative_config=speculative_config, load_config=load_config, - structured_outputs_config=structured_outputs_config, + structured_outputs_config=self.structured_outputs_config, observability_config=observability_config, compilation_config=self.compilation_config, kv_transfer_config=self.kv_transfer_config, From b0c2916b07bb200ed5cfe82d7d3f8ede84e87194 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:52:52 +0200 Subject: [PATCH 15/43] `reasoning_backend` -> `reasoning_parser`, fix `args.reasoning_parser` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../v1/entrypoints/llm/test_struct_output_generate.py | 2 +- vllm/config/__init__.py | 2 +- vllm/entrypoints/openai/api_server.py | 10 +++++----- vllm/model_executor/models/config.py | 4 ++-- vllm/v1/structured_output/__init__.py | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 69ce5c008c82..b989c96dc1ff 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -581,7 +581,7 @@ def test_structured_output_with_reasoning_matrices( max_num_seqs=16, backend=backend, structured_outputs_config=dict(disable_any_whitespace=True, - reasoning_backend=reasoning_parser), + reasoning_parser=reasoning_parser), tokenizer_mode=tokenizer_mode, speculative_config=speculative_config, ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index f89e0985bc8a..bd4ad9b387fc 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -3081,7 +3081,7 @@ class StructuredOutputsConfig: in the JSON schema. This is only supported for the `guidance` backend and is used to better align its behaviour with `outlines` and `xgrammar`.""" - reasoning_backend: str = "" + reasoning_parser: str = "" """Select the reasoning parser depending on the model that you're using. This is used to parse the reasoning content into OpenAI API format.""" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3cebfdf885be..88e06b2adf35 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1746,7 +1746,7 @@ async def init_app_state( enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, tool_server=tool_server, - reasoning_parser=args.reasoning_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, @@ -1765,7 +1765,7 @@ async def init_app_state( exclude_tools_when_tool_choice_none=args. 
exclude_tools_when_tool_choice_none, tool_parser=args.tool_call_parser, - reasoning_parser=args.reasoning_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, @@ -1868,10 +1868,10 @@ def validate_api_server_args(args): f"(chose from {{ {','.join(valid_tool_parses)} }})") valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys() - if args.reasoning_parser \ - and args.reasoning_parser not in valid_reasoning_parses: + if ((reasoning_parser := args.structured_outputs_config.reasoning_parser) + and reasoning_parser not in valid_reasoning_parses): raise KeyError( - f"invalid reasoning parser: {args.reasoning_parser} " + f"invalid reasoning parser: {reasoning_parser} " f"(chose from {{ {','.join(valid_reasoning_parses)} }})") diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 6159f5c9a359..d2063f962a8d 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -253,8 +253,8 @@ class GptOssForCausalLMConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: structured_outputs_config = vllm_config.structured_outputs_config - if structured_outputs_config.reasoning_backend == "": - structured_outputs_config.reasoning_backend = "GptOss" + if structured_outputs_config.reasoning_parser == "": + structured_outputs_config.reasoning_parser = "GptOss" # Increase the max capture size from 512 to 1024 for performance. # NOTE(woosuk): This will increase the number of CUDA graphs diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index c2bffc345d41..8ac5ea4129f7 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -64,11 +64,11 @@ def __init__(self, vllm_config: VllmConfig): scheduler_config=self.vllm_config.scheduler_config, lora_config=self.vllm_config.lora_config, ).get_lora_tokenizer(None) - reasoning_backend = \ - self.vllm_config.structured_outputs_config.reasoning_backend - if reasoning_backend: + reasoning_parser = \ + self.vllm_config.structured_outputs_config.reasoning_parser + if reasoning_parser: reasoner_cls = ReasoningParserManager.get_reasoning_parser( - reasoning_backend) + reasoning_parser) self.reasoner = reasoner_cls(tokenizer=self.tokenizer) def grammar_init(self, request: Request) -> None: From 48328d784694ba9da1c95613cf3df0eb2f8cf5af Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:58:22 +0200 Subject: [PATCH 16/43] Replace more instances of guided/guided decoding Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/openai/test_chat.py | 8 ++++---- .../openai/test_completion_with_function_calling.py | 2 +- tests/entrypoints/openai/test_prompt_validation.py | 2 +- .../openai/test_transcription_validation.py | 2 +- .../openai/test_translation_validation.py | 2 +- .../entrypoints/llm/test_struct_output_generate.py | 12 ++++++------ vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/transformers_utils/tokenizers/mistral.py | 2 +- .../structured_output/backend_lm_format_enforcer.py | 4 ++-- 9 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index e2d438f76311..38015053867d 100644 --- 
a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# imports for guided decoding tests +# imports for structured outputs tests import json from typing import Optional @@ -489,7 +489,7 @@ async def test_structured_outputs_choice_chat(client: openai.AsyncOpenAI, sample_choices, is_v1_server: bool): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("Structured outputs is only supported in v1 engine") messages = [{ "role": "system", "content": "you are a helpful assistant" @@ -529,7 +529,7 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, sample_json_schema, is_v1_server: bool): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("Structured outputs is only supported in v1 engine") messages = [{ "role": "system", @@ -575,7 +575,7 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI, async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI, sample_regex, is_v1_server: bool): if not is_v1_server: - pytest.skip("Guided decoding is only supported in v1 engine") + pytest.skip("Structured outputs is only supported in v1 engine") messages = [{ "role": "system", diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index daaf441abac1..3649cefa9bf4 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -225,7 +225,7 @@ def k2_server(): # noqa: F811 "--dtype", "half", "--enable-auto-tool-choice", - "--guided-decoding-backend", + "--structured-outputs-config.backend", "xgrammar", "--tool-call-parser", "hermes", diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 4197583074df..895149b8d969 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -3,7 +3,7 @@ import io -# imports for guided decoding tests +# imports for structured outputs tests import openai import pybase64 import pytest diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 6a3cdfdfc808..23c99da97ad3 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# imports for guided decoding tests +# imports for structured outputs tests import io import json diff --git a/tests/entrypoints/openai/test_translation_validation.py b/tests/entrypoints/openai/test_translation_validation.py index f43b7a253d28..eb7879927b9b 100644 --- a/tests/entrypoints/openai/test_translation_validation.py +++ b/tests/entrypoints/openai/test_translation_validation.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import io -# imports for guided decoding tests +# imports for structured outputs tests import json import httpx diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index b989c96dc1ff..52fac6173d23 100644 --- 
a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -762,14 +762,14 @@ def test_structured_output_batched_with_non_structured_outputs_requests( disable_any_whitespace=(backend in {"xgrammar", "guidance"})), ) - guided_prompt = ( + structured_outputs_prompt = ( "Give an example JSON for an employee profile that fits this " "schema. Make the response as short as possible. Schema: " f"{sample_json_schema}") - non_guided_prompt = "The diameter of the Earth in kilometers is " + non_structured_outputs_prompt = "The diameter of the Earth in kilometers is " - prompts = [guided_prompt, non_guided_prompt] + prompts = [structured_outputs_prompt, non_structured_outputs_prompt] sampling_params = [ SamplingParams(temperature=1.0, max_tokens=400, @@ -805,16 +805,16 @@ def test_structured_output_batched_with_non_structured_outputs_requests( print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}") if index == 0: - # First prompt is guided, expect valid JSON + # First prompt is structured outputs, expect valid JSON assert "\n" not in generated_text output_json = json.loads(generated_text) jsonschema.validate(instance=output_json, schema=sample_json_schema) else: - # Second prompt is not guided, expect valid output + # Second prompt is not structured outputs, expect valid output # Cannot assert on exact output, but we can expect it to be factual assert "12,742" in generated_text - # non-guided requests should not return a valid JSON here + # non-structured outputs requests should not return a valid JSON here with pytest.raises(ValueError): output_json = json.loads(generated_text) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 35edd2f85cd0..5bee1c5a0d33 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -945,7 +945,7 @@ async def chat_completion_stream_generator( # check to make sure we haven't "forgotten" to stream # any tokens that were generated but previously # matched by partial json parsing - # only happens if we are NOT using guided decoding + # only happens if we are NOT using structured outputs auto_tools_called = False if tool_parser: auto_tools_called = len( diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 5a299b1adf34..479a24e2fee2 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -274,7 +274,7 @@ def _download_mistral_tokenizer_from_hf(tokenizer_name: str, return tokenizer_file # the following attributes are set to fit vLLM's design and are used - # by the guided structured output backends. + # by the structured output backends. 
@property def all_special_tokens_extended(self) -> list[str]: from mistral_common.tokens.tokenizers.base import SpecialTokens diff --git a/vllm/v1/structured_output/backend_lm_format_enforcer.py b/vllm/v1/structured_output/backend_lm_format_enforcer.py index 5f9925d209fe..465b2428f893 100644 --- a/vllm/v1/structured_output/backend_lm_format_enforcer.py +++ b/vllm/v1/structured_output/backend_lm_format_enforcer.py @@ -157,11 +157,11 @@ def validate_structured_output_request_lm_format_enforcer( json.dumps(so_params.json) except Exception as e: raise ValueError( - f"Error serializing guided decoding jsonschema: {e}" + f"Error serializing structured outputs jsonschema: {e}" ) from e return elif so_params.choice: return elif so_params.grammar: - raise ValueError("LM Format Enforcer guided decoding backend " + raise ValueError("LM Format Enforcer structured outputs backend " "does not support grammar specifications") From 8b35c084eead3116132caefabc8619dff4884c13 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 13:44:04 +0200 Subject: [PATCH 17/43] Remove `StructuredOutputsParams.from_optional` as it's not necessary Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 75 +++++++++++++---------------- vllm/sampling_params.py | 27 ----------- 2 files changed, 34 insertions(+), 68 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 50375d64156d..a696864f1f62 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -347,8 +347,9 @@ def to_sampling_params( structured_outputs = None if self.text is not None and self.text.format is not None: response_format = self.text.format - if response_format.type == "json_schema": - structured_outputs = StructuredOutputsParams.from_optional( + if (response_format.type == "json_schema" + and response_format.schema_ is not None): + structured_outputs = StructuredOutputsParams( json=response_format.schema_) elif response_format.type == "json_object": raise NotImplementedError("json_object is not supported") @@ -639,31 +640,28 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - structured_outputs = StructuredOutputsParams( - **(self.structured_outputs or {})) - if self.response_format is not None: - if self.response_format.type == "json_object": - structured_outputs.json_object = True - elif self.response_format.type == "json_schema": - json_schema = self.response_format.json_schema - assert json_schema is not None - structured_outputs.json = json_schema.json_schema - elif self.response_format.type == "structural_tag": - structural_tag = self.response_format - assert structural_tag is not None and isinstance( - structural_tag, StructuralTagResponseFormat) - s_tag_obj = structural_tag.model_dump(by_alias=True) - structured_outputs.structural_tag = json.dumps(s_tag_obj) - - structured_outputs = StructuredOutputsParams.from_optional( - json=self._get_json_schema_from_tool() or structured_outputs.json, - regex=structured_outputs.regex, - choice=structured_outputs.choice, - grammar=structured_outputs.grammar, - json_object=structured_outputs.json_object, - whitespace_pattern=structured_outputs.whitespace_pattern, - structural_tag=structured_outputs.structural_tag, - ) + structured_outputs = None + if (self.structured_outputs is not None + and any(v is not None + for v in self.structured_outputs.values())): + 
structured_outputs = StructuredOutputsParams( + **self.structured_outputs) + + if self.response_format is not None: + if self.response_format.type == "json_object": + structured_outputs.json_object = True + elif self.response_format.type == "json_schema": + json_schema = self.response_format.json_schema + assert json_schema is not None + structured_outputs.json = json_schema.json_schema + elif self.response_format.type == "structural_tag": + structural_tag = self.response_format + assert structural_tag is not None and isinstance( + structural_tag, StructuralTagResponseFormat) + s_tag_obj = structural_tag.model_dump(by_alias=True) + structured_outputs.structural_tag = json.dumps(s_tag_obj) + if json_schema := self._get_json_schema_from_tool(): + structured_outputs.json = json_schema extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -1125,20 +1123,15 @@ def to_sampling_params( echo_without_generation = self.echo and self.max_tokens == 0 - structured_outputs_kwargs = StructuredOutputsParams( - **(self.structured_outputs or {})) - if (self.response_format is not None - and self.response_format.type == "json_object"): - structured_outputs_kwargs.json_object = True - - structured_outputs = StructuredOutputsParams.from_optional( - json=structured_outputs_kwargs.json, - regex=structured_outputs_kwargs.regex, - choice=structured_outputs_kwargs.choice, - grammar=structured_outputs_kwargs.grammar, - json_object=structured_outputs_kwargs.json_object, - whitespace_pattern=structured_outputs_kwargs.whitespace_pattern, - ) + structured_outputs = None + if (self.structured_outputs is not None + and any(v is not None + for v in self.structured_outputs.values())): + structured_outputs = StructuredOutputsParams( + **self.structured_outputs) + if (self.response_format is not None + and self.response_format.type == "json_object"): + structured_outputs.json_object = True extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 398eb93d3ec3..94c8497028cc 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -8,7 +8,6 @@ from typing import Annotated, Any, Optional, Union import msgspec -from pydantic import BaseModel from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor @@ -42,32 +41,6 @@ class StructuredOutputsParams: whitespace_pattern: Optional[str] = None structural_tag: Optional[str] = None - @staticmethod - def from_optional( - json: Optional[Union[dict, BaseModel, str]] = None, - regex: Optional[str] = None, - choice: Optional[list[str]] = None, - grammar: Optional[str] = None, - json_object: Optional[bool] = None, - whitespace_pattern: Optional[str] = None, - structural_tag: Optional[str] = None, - ) -> Optional["StructuredOutputsParams"]: - if all(arg is None for arg in (json, regex, choice, grammar, - json_object, structural_tag)): - return None - # Extract json schemas from pydantic models - if isinstance(json, (BaseModel, type(BaseModel))): - json = json.model_json_schema() - return StructuredOutputsParams( - json=json, - regex=regex, - choice=choice, - grammar=grammar, - json_object=json_object, - whitespace_pattern=whitespace_pattern, - structural_tag=structural_tag, - ) - def __post_init__(self): """Validate that some fields are mutually exclusive.""" count = sum([ From f8947e12f5b92056c5b0b06de6306467da2ae62e Mon Sep 17 00:00:00 2001 From: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 13:46:44 +0200 Subject: [PATCH 18/43] Fix tests Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/llm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 4d64f231bcc4..d2ee7b15b641 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -248,6 +248,8 @@ def __init__( ))) else: structured_outputs_instance = structured_outputs_config + else: + structured_outputs_instance = StructuredOutputsConfig() engine_args = EngineArgs( model=model, From ec81c2bc4efbdfcec24277dbcb68ab06be256428 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 14:02:21 +0200 Subject: [PATCH 19/43] pre-commit Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index a696864f1f62..e91e46c333a9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -660,8 +660,8 @@ def to_sampling_params( structural_tag, StructuralTagResponseFormat) s_tag_obj = structural_tag.model_dump(by_alias=True) structured_outputs.structural_tag = json.dumps(s_tag_obj) - if json_schema := self._get_json_schema_from_tool(): - structured_outputs.json = json_schema + if structured_outputs_json := self._get_json_schema_from_tool(): + structured_outputs.json = structured_outputs_json extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -700,8 +700,7 @@ def to_sampling_params( extra_args=extra_args or None, ) - def _get_json_schema_from_tool( - self) -> Optional[Union[str, dict, BaseModel]]: + def _get_json_schema_from_tool(self) -> Optional[Union[str, dict]]: # user has chosen to not use any tool if self.tool_choice == "none" or self.tools is None: return None From b991f92d3e3ea44b6b099be21a6fb7a301b31c1c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 16:00:27 +0200 Subject: [PATCH 20/43] Fix tests Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 52fac6173d23..57490112f3df 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -579,8 +579,9 @@ def test_structured_output_with_reasoning_matrices( enforce_eager=bool(not current_platform.is_tpu()), max_model_len=1024, max_num_seqs=16, - backend=backend, - structured_outputs_config=dict(disable_any_whitespace=True, + structured_outputs_config=dict(backend=backend, + disable_any_whitespace=backend + not in {"xgrammar", "guidance"}, reasoning_parser=reasoning_parser), tokenizer_mode=tokenizer_mode, speculative_config=speculative_config, @@ -643,7 +644,7 @@ def test_structured_output_auto_mode( llm = LLM(model=model_name, max_model_len=1024, - backend="auto", + structured_outputs=dict(backend="auto"), tokenizer_mode=tokenizer_mode) sampling_params = SamplingParams( @@ -686,6 +687,7 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch): llm = 
LLM(model="Qwen/Qwen2.5-1.5B-Instruct", max_model_len=1024, structured_outputs_config=dict( + backend="guidance", disable_any_whitespace=True, disable_additional_properties=True)) From ce7390330dd8d4bd324a52638ff69ff2599dfb3f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 16:07:08 +0200 Subject: [PATCH 21/43] Fix test pipeline for removed file Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 55349e0ac932..e248e73ae954 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -109,8 +109,7 @@ steps: - tests/entrypoints/offline_mode commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests From 37e77514b87d4c1e55f16286aebc71484b5a8908 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 2 Sep 2025 16:10:09 +0200 Subject: [PATCH 22/43] `--reasoning-parser` -> `--structured-outputs-config.reasoning_parser` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/reasoning_outputs.md | 10 +++++----- docs/features/structured_outputs.md | 2 +- docs/features/tool_calling.md | 2 +- ...openai_chat_completion_tool_calls_with_reasoning.py | 2 +- .../openai_chat_completion_with_reasoning.py | 2 +- .../openai_chat_completion_with_reasoning_streaming.py | 2 +- examples/online_serving/structured_outputs/README.md | 2 +- .../openai/test_chat_with_tool_reasoning.py | 6 +++--- tests/entrypoints/openai/test_cli_args.py | 4 ++-- .../openai/test_completion_with_function_calling.py | 4 ++-- tests/v1/entrypoints/openai/responses/conftest.py | 2 +- 11 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 4b4422f4bf1f..377bc212797c 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -22,11 +22,11 @@ vLLM currently supports the following reasoning models: ## Quickstart -To use reasoning models, you need to specify the `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. +To use reasoning models, you need to specify the `--structured-outputs-config.reasoning_parser` flags when making a request to the chat completion endpoint. The `--structured-outputs-config.reasoning_parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --reasoning-parser deepseek_r1 + --structured-outputs-config.reasoning_parser deepseek_r1 ``` Next, make a request to the model that should return the reasoning content in the response. 
@@ -208,7 +208,7 @@ You can add a new `ReasoningParser` similar to --reasoning-parser example +vllm serve --structured-outputs-config.reasoning_parser example ``` diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 1f955c6e30d6..af23d7108975 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -161,7 +161,7 @@ See also: [full example](../examples/online_serving/structured_outputs.md) You can also use structured outputs with for reasoning models. ```bash -vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1 +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --structured-outputs-config.reasoning_parser deepseek_r1 ``` Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema: diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 35b01ef55b19..e814e3b497aa 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -308,7 +308,7 @@ Supported models: Flags: * For non-reasoning: `--tool-call-parser hunyuan_a13b` -* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning` +* For reasoning: `--tool-call-parser hunyuan_a13b --structured-outputs-config.reasoning_parser hunyuan_a13b --enable_reasoning` ### Models with Pythonic Tool Calls (`pythonic`) diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py index 4006d07f73b0..cd3a7eb2b51f 100644 --- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py @@ -10,7 +10,7 @@ ```bash vllm serve Qwen/QwQ-32B \ - --reasoning-parser deepseek_r1 \ + --structured-outputs-config.reasoning_parser deepseek_r1 \ --enable-auto-tool-choice --tool-call-parser hermes ``` diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index 932dbeb2e7a2..8b934704442f 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -9,7 +9,7 @@ ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --reasoning-parser deepseek_r1 + --structured-outputs-config.reasoning_parser deepseek_r1 ``` This example demonstrates how to generate chat completions from reasoning models diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index 7d1ea3771459..e952bf7ab0f4 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -9,7 +9,7 @@ ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --reasoning-parser deepseek_r1 + --structured-outputs-config.reasoning_parser deepseek_r1 ``` Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the diff --git a/examples/online_serving/structured_outputs/README.md b/examples/online_serving/structured_outputs/README.md index d2777a43d478..bde4fa105808 100644 --- a/examples/online_serving/structured_outputs/README.md +++ b/examples/online_serving/structured_outputs/README.md @@ -14,7 +14,7 @@ To serve a reasoning model, you can use the following 
command: ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ - --reasoning-parser deepseek_r1 + --structured-outputs-config.reasoning_parser deepseek_r1 ``` If you want to run this script standalone with `uv`, you can use the following: diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py index 03730b67283c..6d7067159894 100644 --- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py +++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py @@ -14,9 +14,9 @@ @pytest.fixture(scope="module") def server(): # noqa: F811 args = [ - "--max-model-len", "8192", "--enforce-eager", "--reasoning-parser", - "deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser", - "hermes" + "--max-model-len", "8192", "--enforce-eager", + "--structured-outputs-config.reasoning_parser", "deepseek_r1", + "--enable-auto-tool-choice", "--tool-call-parser", "hermes" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 9a1c0ea13b54..f22008f44d15 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -145,7 +145,7 @@ def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser): """Ensure validation fails if reasoning is enabled with auto tool choice""" args = serve_parser.parse_args(args=[ "--enable-auto-tool-choice", - "--reasoning-parser", + "--structured-outputs-config.reasoning_parser", "deepseek_r1", ]) with pytest.raises(TypeError): @@ -156,7 +156,7 @@ def test_passes_with_reasoning_parser(serve_parser): """Ensure validation passes if reasoning is enabled with a reasoning parser""" args = serve_parser.parse_args(args=[ - "--reasoning-parser", + "--structured-outputs-config.reasoning_parser", "deepseek_r1", ]) validate_parsed_serve_args(args) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 3649cefa9bf4..8025f78e2c61 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -146,7 +146,7 @@ def server(): # noqa: F811 "xgrammar", "--tool-call-parser", "hermes", - "--reasoning-parser", + "--structured-outputs-config.reasoning_parser", "qwen3", "--gpu-memory-utilization", "0.4" @@ -229,7 +229,7 @@ def k2_server(): # noqa: F811 "xgrammar", "--tool-call-parser", "hermes", - "--reasoning-parser", + "--structured-outputs-config.reasoning_parser", "qwen3", "--gpu-memory-utilization", "0.4", diff --git a/tests/v1/entrypoints/openai/responses/conftest.py b/tests/v1/entrypoints/openai/responses/conftest.py index 2d677a00b646..f33e590b7296 100644 --- a/tests/v1/entrypoints/openai/responses/conftest.py +++ b/tests/v1/entrypoints/openai/responses/conftest.py @@ -15,7 +15,7 @@ def default_server_args(): "--max-model-len", "8192", "--enforce-eager", # For faster startup. 
- "--reasoning-parser", + "--structured-outputs-config.reasoning_parser", "deepseek_r1", ] From f85abd756820b40428b9d263be752f61ef9d5c38 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:35:37 +0200 Subject: [PATCH 23/43] Add reasoning parser back as an `InitVar` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index dcaa534b5850..ca67a90dfe63 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -8,7 +8,7 @@ import functools import json import sys -from dataclasses import MISSING, dataclass, fields, is_dataclass +from dataclasses import MISSING, InitVar, dataclass, fields, is_dataclass from itertools import permutations from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union, cast, get_args, @@ -39,6 +39,7 @@ from vllm.platforms import CpuArchEnum, current_platform from vllm.plugins import load_general_plugins from vllm.ray.lazy_utils import is_ray_initialized +from vllm.reasoning import ReasoningParserManager from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.config import get_model_path, is_interleaved from vllm.transformers_utils.utils import check_gguf_file @@ -415,6 +416,7 @@ class EngineArgs: structured_outputs_config: StructuredOutputsConfig = get_field( VllmConfig, "structured_outputs_config") + reasoning_parser: InitVar[str] = StructuredOutputsConfig.reasoning_parser logits_processor_pattern: Optional[ str] = ModelConfig.logits_processor_pattern @@ -470,7 +472,8 @@ class EngineArgs: kv_sharing_fast_prefill: bool = \ CacheConfig.kv_sharing_fast_prefill - def __post_init__(self): + def __post_init__(self, reasoning_parser: str): + self.structured_outputs_config.reasoning_parser = reasoning_parser # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object @@ -609,6 +612,17 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: load_group.add_argument('--pt-load-map-location', **load_kwargs["pt_load_map_location"]) + # Structured outputs arguments + structured_outputs_kwargs = get_kwargs(StructuredOutputsConfig) + structured_outputs_group = parser.add_argument_group( + title="StructuredOutputsConfig", + description=StructuredOutputsConfig.__doc__, + ) + structured_outputs_group.add_argument( + "--reasoning-parser", + # This choice is a special case because it's not static + choices=list(ReasoningParserManager.reasoning_parsers), + **structured_outputs_kwargs["reasoning_parser"]) # Parallel arguments parallel_kwargs = get_kwargs(ParallelConfig) parallel_group = parser.add_argument_group( From 85b3b4305feda7a306f404555340d1c16b88907a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:37:53 +0200 Subject: [PATCH 24/43] `pre-commit` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/llm_engine.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ee211fcc1b01..03f96e196e9b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -213,6 +213,7 @@ def __init__( self.device_config = vllm_config.device_config self.speculative_config = 
vllm_config.speculative_config # noqa self.load_config = vllm_config.load_config + self.structured_outputs_config = vllm_config.structured_outputs_config self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa ) @@ -371,10 +372,9 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.observability_config.otlp_traces_endpoint) # Initialize reasoning parser if reasoning backend is set. - if self.decoding_config.reasoning_backend and \ - self.tokenizer: + if self.structured_outputs_config.reasoning_parser and self.tokenizer: reasoner_class = ReasoningParserManager.get_reasoning_parser( - self.decoding_config.reasoning_backend) + self.structured_outputs_config.reasoning_parser) self.reasoner: ReasoningParser = reasoner_class( self.tokenizer.get_lora_tokenizer()) @@ -390,7 +390,8 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: stop_checker=StopChecker( self.scheduler_config.max_model_len, get_tokenizer_for_seq, - self.reasoner if self.decoding_config.reasoning_backend + self.reasoner + if self.structured_outputs_config.reasoning_parser and self.tokenizer else None, ), )) From b4f70f73f9740a14001aed92813eef67f7890375 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:41:46 +0200 Subject: [PATCH 25/43] Reinstate deprecated guided decoding CLI args Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ca67a90dfe63..126a1b74e3e0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -417,6 +417,11 @@ class EngineArgs: structured_outputs_config: StructuredOutputsConfig = get_field( VllmConfig, "structured_outputs_config") reasoning_parser: InitVar[str] = StructuredOutputsConfig.reasoning_parser + # Deprecated guided decoding fields + guided_decoding_backend: str = None + guided_decoding_disable_fallback: bool = None + guided_decoding_disable_any_whitespace: bool = None + guided_decoding_disable_additional_properties: bool = None logits_processor_pattern: Optional[ str] = ModelConfig.logits_processor_pattern @@ -623,6 +628,19 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: # This choice is a special case because it's not static choices=list(ReasoningParserManager.reasoning_parsers), **structured_outputs_kwargs["reasoning_parser"]) + # Deprecated guided decoding arguments + for arg, type in [ + ("--guided-decoding-backend", str), + ("--guided-decoding-disable-fallback", bool), + ("--guided-decoding-disable-any-whitespace", bool), + ("--guided-decoding-disable-additional-properties", bool), + ]: + structured_outputs_group.add_argument( + arg, + type=type, + help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."), + deprecated=True) + # Parallel arguments parallel_kwargs = get_kwargs(ParallelConfig) parallel_group = parser.add_argument_group( @@ -1399,6 +1417,21 @@ def create_engine_config( load_config = self.create_load_config() + # Forward the deprecated CLI args to the StructuredOutputsConfig + so_config = self.structured_outputs_config + if self.guided_decoding_backend is not None: + so_config.guided_decoding_backend = \ + self.guided_decoding_backend + if self.guided_decoding_disable_fallback is not None: + so_config.guided_decoding_disable_fallback = \ + self.guided_decoding_disable_fallback + if 
self.guided_decoding_disable_any_whitespace is not None: + so_config.guided_decoding_disable_any_whitespace = \ + self.guided_decoding_disable_any_whitespace + if self.guided_decoding_disable_additional_properties is not None: + so_config.guided_decoding_disable_additional_properties = \ + self.guided_decoding_disable_additional_properties + observability_config = ObservabilityConfig( show_hidden_metrics_for_version=( self.show_hidden_metrics_for_version), From b52aab9d711ec599c003fafb9a53bf040b733ea4 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:51:35 +0200 Subject: [PATCH 26/43] `--structured-outputs-config.reasoning_parser` -> `--reasoning-parser` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/reasoning_outputs.md | 10 +++++----- docs/features/structured_outputs.md | 2 +- docs/features/tool_calling.md | 2 +- ...openai_chat_completion_tool_calls_with_reasoning.py | 2 +- .../openai_chat_completion_with_reasoning.py | 2 +- .../openai_chat_completion_with_reasoning_streaming.py | 2 +- examples/online_serving/structured_outputs/README.md | 2 +- .../openai/test_chat_with_tool_reasoning.py | 6 +++--- tests/entrypoints/openai/test_cli_args.py | 4 ++-- .../openai/test_completion_with_function_calling.py | 4 ++-- tests/v1/entrypoints/openai/responses/conftest.py | 2 +- 11 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md index 329b174af49e..85681669dfb2 100644 --- a/docs/features/reasoning_outputs.md +++ b/docs/features/reasoning_outputs.md @@ -23,11 +23,11 @@ vLLM currently supports the following reasoning models: ## Quickstart -To use reasoning models, you need to specify the `--structured-outputs-config.reasoning_parser` flags when making a request to the chat completion endpoint. The `--structured-outputs-config.reasoning_parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. +To use reasoning models, you need to specify the `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --structured-outputs-config.reasoning_parser deepseek_r1 + --reasoning-parser deepseek_r1 ``` Next, make a request to the model that should return the reasoning content in the response. @@ -209,7 +209,7 @@ You can add a new `ReasoningParser` similar to --structured-outputs-config.reasoning_parser example +vllm serve --reasoning-parser example ``` diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index af23d7108975..1f955c6e30d6 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -161,7 +161,7 @@ See also: [full example](../examples/online_serving/structured_outputs.md) You can also use structured outputs with for reasoning models. ```bash -vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --structured-outputs-config.reasoning_parser deepseek_r1 +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r1 ``` Note that you can use reasoning with any provided structured outputs feature. 
The following uses one with JSON schema: diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 01f9ad62908c..720102ff9ea3 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -308,7 +308,7 @@ Supported models: Flags: * For non-reasoning: `--tool-call-parser hunyuan_a13b` -* For reasoning: `--tool-call-parser hunyuan_a13b --structured-outputs-config.reasoning_parser hunyuan_a13b --enable_reasoning` +* For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning` ### GLM-4.5 Models (`glm45`) diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py index cd3a7eb2b51f..4006d07f73b0 100644 --- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py @@ -10,7 +10,7 @@ ```bash vllm serve Qwen/QwQ-32B \ - --structured-outputs-config.reasoning_parser deepseek_r1 \ + --reasoning-parser deepseek_r1 \ --enable-auto-tool-choice --tool-call-parser hermes ``` diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index 8b934704442f..932dbeb2e7a2 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -9,7 +9,7 @@ ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --structured-outputs-config.reasoning_parser deepseek_r1 + --reasoning-parser deepseek_r1 ``` This example demonstrates how to generate chat completions from reasoning models diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index e952bf7ab0f4..7d1ea3771459 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -9,7 +9,7 @@ ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ - --structured-outputs-config.reasoning_parser deepseek_r1 + --reasoning-parser deepseek_r1 ``` Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the diff --git a/examples/online_serving/structured_outputs/README.md b/examples/online_serving/structured_outputs/README.md index bde4fa105808..d2777a43d478 100644 --- a/examples/online_serving/structured_outputs/README.md +++ b/examples/online_serving/structured_outputs/README.md @@ -14,7 +14,7 @@ To serve a reasoning model, you can use the following command: ```bash vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \ - --structured-outputs-config.reasoning_parser deepseek_r1 + --reasoning-parser deepseek_r1 ``` If you want to run this script standalone with `uv`, you can use the following: diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py index 6d7067159894..03730b67283c 100644 --- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py +++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py @@ -14,9 +14,9 @@ @pytest.fixture(scope="module") def server(): # noqa: F811 args = [ - "--max-model-len", "8192", "--enforce-eager", - "--structured-outputs-config.reasoning_parser", "deepseek_r1", - "--enable-auto-tool-choice", "--tool-call-parser", "hermes" + "--max-model-len", "8192", 
"--enforce-eager", "--reasoning-parser", + "deepseek_r1", "--enable-auto-tool-choice", "--tool-call-parser", + "hermes" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index f22008f44d15..9a1c0ea13b54 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -145,7 +145,7 @@ def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser): """Ensure validation fails if reasoning is enabled with auto tool choice""" args = serve_parser.parse_args(args=[ "--enable-auto-tool-choice", - "--structured-outputs-config.reasoning_parser", + "--reasoning-parser", "deepseek_r1", ]) with pytest.raises(TypeError): @@ -156,7 +156,7 @@ def test_passes_with_reasoning_parser(serve_parser): """Ensure validation passes if reasoning is enabled with a reasoning parser""" args = serve_parser.parse_args(args=[ - "--structured-outputs-config.reasoning_parser", + "--reasoning-parser", "deepseek_r1", ]) validate_parsed_serve_args(args) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 8025f78e2c61..3649cefa9bf4 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -146,7 +146,7 @@ def server(): # noqa: F811 "xgrammar", "--tool-call-parser", "hermes", - "--structured-outputs-config.reasoning_parser", + "--reasoning-parser", "qwen3", "--gpu-memory-utilization", "0.4" @@ -229,7 +229,7 @@ def k2_server(): # noqa: F811 "xgrammar", "--tool-call-parser", "hermes", - "--structured-outputs-config.reasoning_parser", + "--reasoning-parser", "qwen3", "--gpu-memory-utilization", "0.4", diff --git a/tests/v1/entrypoints/openai/responses/conftest.py b/tests/v1/entrypoints/openai/responses/conftest.py index f33e590b7296..2d677a00b646 100644 --- a/tests/v1/entrypoints/openai/responses/conftest.py +++ b/tests/v1/entrypoints/openai/responses/conftest.py @@ -15,7 +15,7 @@ def default_server_args(): "--max-model-len", "8192", "--enforce-eager", # For faster startup. 
- "--structured-outputs-config.reasoning_parser", + "--reasoning-parser", "deepseek_r1", ] From ac75b2a6fb050984a03cfbd0362a46e6686109e5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:57:10 +0200 Subject: [PATCH 27/43] `InitVar` didn't work Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 126a1b74e3e0..012a917f975a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -8,7 +8,7 @@ import functools import json import sys -from dataclasses import MISSING, InitVar, dataclass, fields, is_dataclass +from dataclasses import MISSING, dataclass, fields, is_dataclass from itertools import permutations from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List, Literal, Optional, Type, TypeVar, Union, cast, get_args, @@ -416,7 +416,7 @@ class EngineArgs: structured_outputs_config: StructuredOutputsConfig = get_field( VllmConfig, "structured_outputs_config") - reasoning_parser: InitVar[str] = StructuredOutputsConfig.reasoning_parser + reasoning_parser: str = StructuredOutputsConfig.reasoning_parser # Deprecated guided decoding fields guided_decoding_backend: str = None guided_decoding_disable_fallback: bool = None @@ -477,8 +477,7 @@ class EngineArgs: kv_sharing_fast_prefill: bool = \ CacheConfig.kv_sharing_fast_prefill - def __post_init__(self, reasoning_parser: str): - self.structured_outputs_config.reasoning_parser = reasoning_parser + def __post_init__(self): # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object @@ -1417,6 +1416,11 @@ def create_engine_config( load_config = self.create_load_config() + # Pass reasoning_parser into StructuredOutputsConfig + if self.reasoning_parser: + self.structured_outputs_config.reasoning_parser = \ + self.reasoning_parser + # Forward the deprecated CLI args to the StructuredOutputsConfig so_config = self.structured_outputs_config if self.guided_decoding_backend is not None: From 292903653b5f46b75c119ed309138b56a3aef9e7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 14:57:43 +0200 Subject: [PATCH 28/43] `pre-commit` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 012a917f975a..5ff28dd4e248 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -418,10 +418,10 @@ class EngineArgs: VllmConfig, "structured_outputs_config") reasoning_parser: str = StructuredOutputsConfig.reasoning_parser # Deprecated guided decoding fields - guided_decoding_backend: str = None - guided_decoding_disable_fallback: bool = None - guided_decoding_disable_any_whitespace: bool = None - guided_decoding_disable_additional_properties: bool = None + guided_decoding_backend: Optional[str] = None + guided_decoding_disable_fallback: Optional[bool] = None + guided_decoding_disable_any_whitespace: Optional[bool] = None + guided_decoding_disable_additional_properties: Optional[bool] = None logits_processor_pattern: Optional[ str] = ModelConfig.logits_processor_pattern From 295ac17ec036e9c34b51e3baab205d8388e6a98e Mon Sep 17 00:00:00 2001 From: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:06:10 +0200 Subject: [PATCH 29/43] `sample_choices` -> `sample_structured_outputs_choices` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/conftest.py | 2 +- tests/entrypoints/openai/test_chat.py | 76 ++++++++++++++++--- tests/entrypoints/openai/test_completion.py | 7 +- tests/v1/entrypoints/conftest.py | 2 +- .../llm/test_struct_output_generate.py | 7 +- 5 files changed, 76 insertions(+), 18 deletions(-) diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index 88591b5eba09..30f2d67588fe 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -184,7 +184,7 @@ def sample_enum_json_schema(): @pytest.fixture -def sample_choices(): +def sample_structured_outputs_choices(): return [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", "Swift", "Kotlin" diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 35a896d0951e..08c5b37e683b 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -12,7 +12,7 @@ import regex as re import requests import torch -from openai import BadRequestError +from openai import BadRequestError, OpenAI from ...utils import RemoteOpenAIServer @@ -485,9 +485,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_structured_outputs_choice_chat(client: openai.AsyncOpenAI, - sample_choices, - is_v1_server: bool): +async def test_structured_outputs_choice_chat( + client: openai.AsyncOpenAI, sample_structured_outputs_choices, + is_v1_server: bool): if not is_v1_server: pytest.skip("Structured outputs is only supported in v1 engine") messages = [{ @@ -504,9 +504,10 @@ async def test_structured_outputs_choice_chat(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(structured_outputs={"choice": sample_choices})) + extra_body=dict( + structured_outputs={"choice": sample_structured_outputs_choices})) choice1 = chat_completion.choices[0].message.content - assert choice1 in sample_choices + assert choice1 in sample_structured_outputs_choices messages.append({"role": "assistant", "content": choice1}) messages.append({ @@ -518,9 +519,10 @@ async def test_structured_outputs_choice_chat(client: openai.AsyncOpenAI, messages=messages, max_completion_tokens=10, temperature=0.7, - extra_body=dict(structured_outputs={"choice": sample_choices})) + extra_body=dict( + structured_outputs={"choice": sample_structured_outputs_choices})) choice2 = chat_completion.choices[0].message.content - assert choice2 in sample_choices + assert choice2 in sample_structured_outputs_choices assert choice1 != choice2 @@ -633,7 +635,7 @@ async def test_structured_outputs_type_error(client: openai.AsyncOpenAI): @pytest.mark.asyncio async def test_structured_outputs_choice_chat_logprobs( - client: openai.AsyncOpenAI, sample_choices): + client: openai.AsyncOpenAI, sample_structured_outputs_choices): messages = [{ "role": "system", @@ -650,7 +652,8 @@ async def test_structured_outputs_choice_chat_logprobs( max_completion_tokens=10, logprobs=True, top_logprobs=5, - extra_body=dict(structured_outputs={"choice": sample_choices})) + extra_body=dict( + structured_outputs={"choice": sample_structured_outputs_choices})) assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs.content is not None @@ 
-972,6 +975,59 @@ async def test_long_seed(client: openai.AsyncOpenAI): or "less_than_equal" in exc_info.value.message) +@pytest.mark.asyncio +async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer): + url = f"http://localhost:{server.port}/v1/chat/completions" + headers = { + "Content-Type": "application/json", + } + data = { + # model_name is avoided here. + "messages": [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "what is 1+1?" + }], + "max_tokens": + 5 + } + + response = requests.post(url, headers=headers, json=data) + response_data = response.json() + print(response_data) + assert response_data.get("model") == MODEL_NAME + choice = response_data.get("choices")[0] + message = choice.get("message") + assert message is not None + content = message.get("content") + assert content is not None + assert len(content) > 0 + + +@pytest.mark.asyncio +async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer): + openai_api_key = "EMPTY" + openai_api_base = f"http://localhost:{server.port}/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + messages = [ + { + "role": "user", + "content": "Hello, vLLM!" + }, + ] + response = client.chat.completions.create( + model="", # empty string + messages=messages, + ) + assert response.model == MODEL_NAME + + @pytest.mark.asyncio async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI): diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index e0fa8f2a9cce..aca88399e1f2 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -659,7 +659,7 @@ async def test_structured_outputs_regex_completion( @pytest.mark.asyncio async def test_structured_outputs_choice_completion( client: openai.AsyncOpenAI, - sample_choices, + sample_structured_outputs_choices, is_v1_server: bool, ): if not is_v1_server: @@ -671,12 +671,13 @@ async def test_structured_outputs_choice_completion( n=2, temperature=1.0, max_tokens=10, - extra_body=dict(structured_outputs=dict(choice=sample_choices))) + extra_body=dict(structured_outputs=dict( + choice=sample_structured_outputs_choices))) assert completion.id is not None assert len(completion.choices) == 2 for i in range(2): - assert completion.choices[i].text in sample_choices + assert completion.choices[i].text in sample_structured_outputs_choices @pytest.mark.asyncio diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py index 08d50e3fc928..46b953fe3743 100644 --- a/tests/v1/entrypoints/conftest.py +++ b/tests/v1/entrypoints/conftest.py @@ -151,7 +151,7 @@ def sample_definition_json_schema(): @pytest.fixture -def sample_choices(): +def sample_structured_outputs_choices(): return [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", "Swift", "Kotlin" diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index b0690815c2d1..4db4ba4fca83 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -100,7 +100,7 @@ def test_structured_output( sample_sql_ebnf: str, sample_sql_lark: str, sample_regex: str, - sample_choices: str, + sample_structured_outputs_choices: str, backend: str, tokenizer_mode: str, model_name: str, @@ -356,7 +356,8 @@ def test_structured_output( sampling_params = 
SamplingParams( temperature=0.8, top_p=0.95, - structured_outputs=StructuredOutputsParams(choice=sample_choices)) + structured_outputs=StructuredOutputsParams( + choice=sample_structured_outputs_choices)) outputs = llm.generate( ("The best language for type-safe systems programming is " @@ -372,7 +373,7 @@ def test_structured_output( generated_text = output.outputs[0].text print(generated_text) assert generated_text is not None - assert generated_text in sample_choices + assert generated_text in sample_structured_outputs_choices print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") # From 6954712590bd2fbbfdcc728c9785a27274f3edb8 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:08:26 +0200 Subject: [PATCH 30/43] Update mergify path Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .github/mergify.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 7448b2de94a3..94198b1251e0 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -171,6 +171,7 @@ pull_request_rules: - files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files~=^tests/v1/structured_output/ + - files=tests/v1/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ actions: label: From 49b6d893bbb995335cd361868497fac1a34d9f77 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:10:38 +0200 Subject: [PATCH 31/43] Fix wrong kwarg Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 4db4ba4fca83..d0e0d4e77896 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -646,7 +646,7 @@ def test_structured_output_auto_mode( llm = LLM(model=model_name, max_model_len=1024, - structured_outputs=dict(backend="auto"), + structured_outputs_config=dict(backend="auto"), tokenizer_mode=tokenizer_mode) sampling_params = SamplingParams( From e36177256a76d5bcc8fceb58ec994d38ea1073b6 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:15:28 +0200 Subject: [PATCH 32/43] Simplify dict -> config Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/llm.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 8ec37d183c20..19b3e902dc37 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -238,9 +238,12 @@ def __init__( compilation_config_instance = CompilationConfig( level=compilation_config) elif isinstance(compilation_config, dict): - predicate = lambda x: is_init_field(CompilationConfig, x[0]) compilation_config_instance = CompilationConfig( - **dict(filter(predicate, compilation_config.items()))) + **{ + k: v + for k, v in compilation_config.items() + if is_init_field(CompilationConfig, k) + }) else: compilation_config_instance = compilation_config else: @@ -248,13 +251,12 @@ def __init__( if structured_outputs_config is not None: if 
isinstance(structured_outputs_config, dict): - predicate = lambda x: is_init_field(StructuredOutputsConfig, x[ - 0]) - structured_outputs_instance = StructuredOutputsConfig(**dict( - filter( - predicate, - structured_outputs_config.items(), - ))) + structured_outputs_instance = StructuredOutputsConfig( + **{ + k: v + for k, v in structured_outputs_config.items() + if is_init_field(StructuredOutputsConfig, k) + }) else: structured_outputs_instance = structured_outputs_config else: From 94a2e74b0e7458c9ab315d7db59e7cdbce752061 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:20:11 +0200 Subject: [PATCH 33/43] Simplify request validation Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index c12aec4fe164..9c402340bf98 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -814,15 +814,9 @@ def check_structured_outputs_count(cls, data): return data structured_outputs_kwargs = data['structured_outputs'] - - count = sum([ - "json" in structured_outputs_kwargs - and structured_outputs_kwargs["json"] is not None, - "regex" in structured_outputs_kwargs - and structured_outputs_kwargs["regex"] is not None, - "choice" in structured_outputs_kwargs - and structured_outputs_kwargs["choice"] is not None - ]) + count = sum( + structured_outputs_kwargs.get(k) is not None + for k in ("json", "regex", "choice")) # you can only use one kind of constraints for structured outputs if count > 1: raise ValueError( @@ -1175,14 +1169,9 @@ def check_structured_outputs_count(cls, data): return data structured_outputs_kwargs = data['structured_outputs'] - count = sum([ - "json" in structured_outputs_kwargs - and structured_outputs_kwargs["json"] is not None, - "regex" in structured_outputs_kwargs - and structured_outputs_kwargs["regex"] is not None, - "choice" in structured_outputs_kwargs - and structured_outputs_kwargs["choice"] is not None - ]) + count = sum( + structured_outputs_kwargs.get(k) is not None + for k in ("json", "regex", "choice")) if count > 1: raise ValueError( "You can only use one kind of constraints for structured " From 02ef1a7e72c1fb5144cfb9fd3be8d249f78756ee Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 19:20:10 +0200 Subject: [PATCH 34/43] Small typo Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/features/tool_calling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 720102ff9ea3..2a48596571d1 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -83,7 +83,7 @@ For more advanced usage, including parallel tool calls and different model-speci ## Named Function Calling -vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backend supported by vLLM. You are guaranteed a validly-parsable function call - not a +vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a high-quality one. 
vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. From a2b0c18cd4bb1021c2cbe071e9541f44a7e0e8c2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 20:00:02 +0200 Subject: [PATCH 35/43] Fix type checking of `structured_outputs` in protocol Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 38 +++++++++++------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 9c402340bf98..f893e3449f6f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -519,7 +519,7 @@ class ChatCompletionRequest(OpenAIBaseModel): default=None, description=("Additional kwargs to pass to the HF processor."), ) - structured_outputs: Optional[dict[str, Any]] = Field( + structured_outputs: Optional[StructuredOutputsParams] = Field( default=None, description="Additional kwargs for structured outputs", ) @@ -640,28 +640,23 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - structured_outputs = None - if (self.structured_outputs is not None - and any(v is not None - for v in self.structured_outputs.values())): - structured_outputs = StructuredOutputsParams( - **self.structured_outputs) - + if self.structured_outputs is not None: if self.response_format is not None: if self.response_format.type == "json_object": - structured_outputs.json_object = True + self.structured_outputs.json_object = True elif self.response_format.type == "json_schema": json_schema = self.response_format.json_schema assert json_schema is not None - structured_outputs.json = json_schema.json_schema + self.structured_outputs.json = json_schema.json_schema elif self.response_format.type == "structural_tag": structural_tag = self.response_format assert structural_tag is not None and isinstance( structural_tag, StructuralTagResponseFormat) s_tag_obj = structural_tag.model_dump(by_alias=True) - structured_outputs.structural_tag = json.dumps(s_tag_obj) + self.structured_outputs.structural_tag = json.dumps( + s_tag_obj) if structured_outputs_json := self._get_json_schema_from_tool(): - structured_outputs.json = structured_outputs_json + self.structured_outputs.json = structured_outputs_json extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -693,9 +688,9 @@ def to_sampling_params( truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - structured_outputs=structured_outputs, + structured_outputs=self.structured_outputs, logit_bias=self.logit_bias, - bad_words= self.bad_words, + bad_words=self.bad_words, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, ) @@ -983,7 +978,7 @@ class CompletionRequest(OpenAIBaseModel): ", {'type': 'structural_tag'}, or {'type': 'text' } is supported." 
), ) - structured_outputs: Optional[dict[str, Any]] = Field( + structured_outputs: Optional[StructuredOutputsParams] = Field( default=None, description="Additional kwargs for structured outputs", ) @@ -1116,15 +1111,10 @@ def to_sampling_params( echo_without_generation = self.echo and self.max_tokens == 0 - structured_outputs = None if (self.structured_outputs is not None - and any(v is not None - for v in self.structured_outputs.values())): - structured_outputs = StructuredOutputsParams( - **self.structured_outputs) - if (self.response_format is not None - and self.response_format.type == "json_object"): - structured_outputs.json_object = True + and self.response_format is not None + and self.response_format.type == "json_object"): + self.structured_outputs.json_object = True extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -1156,7 +1146,7 @@ def to_sampling_params( truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream \ else RequestOutputKind.FINAL_ONLY, - structured_outputs=structured_outputs, + structured_outputs=self.structured_outputs, logit_bias=self.logit_bias, allowed_token_ids=self.allowed_token_ids, extra_args=extra_args or None, From 176ecce61b5e10be62f356a41aef4fafe6ab1f73 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 22:41:31 +0200 Subject: [PATCH 36/43] Fix incorrect condition for enabling disable_any_whitespace in test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index d0e0d4e77896..abc7973aee13 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -583,7 +583,7 @@ def test_structured_output_with_reasoning_matrices( max_num_seqs=16, structured_outputs_config=dict(backend=backend, disable_any_whitespace=backend - not in {"xgrammar", "guidance"}, + in {"xgrammar", "guidance"}, reasoning_parser=reasoning_parser), tokenizer_mode=tokenizer_mode, speculative_config=speculative_config, @@ -763,7 +763,8 @@ def test_structured_output_batched_with_non_structured_outputs_requests( max_model_len=1024, structured_outputs_config=StructuredOutputsConfig( backend=backend, - disable_any_whitespace=(backend in {"xgrammar", "guidance"})), + disable_any_whitespace=backend in {"xgrammar", "guidance"}, + ), ) structured_outputs_prompt = ( From 553d7a5d3c642911ef2bb00322e8ea54ba44d004 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 16 Sep 2025 23:30:44 +0200 Subject: [PATCH 37/43] Fix opinionated backend selection when `backend="auto"` Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/sampling_params.py | 10 +++++++--- vllm/v1/engine/processor.py | 8 ++++++++ vllm/v1/structured_output/__init__.py | 3 ++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 0100c7ccc646..cac7f72a72d3 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -2,12 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sampling parameters for text generation.""" import copy -from dataclasses import dataclass +from 
dataclasses import field from enum import Enum, IntEnum from functools import cached_property from typing import Annotated, Any, Optional, Union import msgspec +from pydantic.dataclasses import dataclass from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor @@ -28,19 +29,22 @@ class SamplingType(IntEnum): # maybe make msgspec? @dataclass class StructuredOutputsParams: - """One of these fields will be used to build a logit processor.""" + # One of these fields will be used to build a logit processor. json: Optional[Union[str, dict]] = None regex: Optional[str] = None choice: Optional[list[str]] = None grammar: Optional[str] = None json_object: Optional[bool] = None - """These are other options that can be set""" + # These are other options that can be set. disable_fallback: bool = False disable_any_whitespace: bool = False disable_additional_properties: bool = False whitespace_pattern: Optional[str] = None structural_tag: Optional[str] = None + _backend: Optional[str] = field(default=None, init=False) + """CAUTION: Should only be set by Processor._validate_structured_output""" + def __post_init__(self): """Validate that some fields are mutually exclusive.""" count = sum([ diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 045470a81c0d..717a5ba64d37 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -216,6 +216,12 @@ def _validate_structured_output(self, params: SamplingParams) -> None: ) backend = self.structured_outputs_config.backend + if params.structured_outputs._backend and backend != "auto": + raise ValueError( + "StructuredOutputsParams._backend should only be set here if " + "StructuredOutputsConfig.backend is 'auto'.") + else: + params.structured_outputs._backend = backend # Request content validation if (isinstance(params.structured_outputs.choice, list) @@ -249,11 +255,13 @@ def _validate_structured_output(self, params: SamplingParams) -> None: # other setting where a specific backend was specified. try: validate_xgrammar_grammar(params) + params.structured_outputs._backend = "xgrammar" except ValueError: # The request either failed validation # or includes some jsonschema feature(s) that # are not supported in xgrammar. Fall back to guidance. validate_guidance_grammar(params, tokenizer=None) + params.structured_outputs._backend = "guidance" def _maybe_build_mm_uuids( self, diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 8ac5ea4129f7..d2c09e2a1f9d 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -83,9 +83,10 @@ def grammar_init(self, request: Request) -> None: # # NOTE: We only support a single backend. We do NOT support different # backends on a per-request basis in V1 (for now, anyway...). 
+ # _backend is set in Processor._validate_structured_output if self.backend is None: assert request.sampling_params is not None - backend = self.vllm_config.structured_outputs_config.backend + backend = request.sampling_params.structured_outputs._backend vocab_size = self.vllm_config.model_config.get_vocab_size() if backend == "xgrammar": self.backend = XgrammarBackend( From bd5ef9476b9cecc5c1303daa82aec1300eefc2bd Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Sep 2025 00:25:27 +0200 Subject: [PATCH 38/43] Remove badly merged change Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/openai/test_chat.py | 55 +-------------------------- 1 file changed, 1 insertion(+), 54 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 08c5b37e683b..04876d29becb 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -12,7 +12,7 @@ import regex as re import requests import torch -from openai import BadRequestError, OpenAI +from openai import BadRequestError from ...utils import RemoteOpenAIServer @@ -975,59 +975,6 @@ async def test_long_seed(client: openai.AsyncOpenAI): or "less_than_equal" in exc_info.value.message) -@pytest.mark.asyncio -async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer): - url = f"http://localhost:{server.port}/v1/chat/completions" - headers = { - "Content-Type": "application/json", - } - data = { - # model_name is avoided here. - "messages": [{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "what is 1+1?" - }], - "max_tokens": - 5 - } - - response = requests.post(url, headers=headers, json=data) - response_data = response.json() - print(response_data) - assert response_data.get("model") == MODEL_NAME - choice = response_data.get("choices")[0] - message = choice.get("message") - assert message is not None - content = message.get("content") - assert content is not None - assert len(content) > 0 - - -@pytest.mark.asyncio -async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer): - openai_api_key = "EMPTY" - openai_api_base = f"http://localhost:{server.port}/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - messages = [ - { - "role": "user", - "content": "Hello, vLLM!" 
- }, - ] - response = client.chat.completions.create( - model="", # empty string - messages=messages, - ) - assert response.model == MODEL_NAME - - @pytest.mark.asyncio async def test_invocations(server: RemoteOpenAIServer, client: openai.AsyncOpenAI): From 8b38bc4aeca6e4a8d1043c172ba074769656d374 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Sep 2025 00:43:27 +0200 Subject: [PATCH 39/43] Fix opinionated backend selection part 2 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/sampling_params.py | 2 ++ vllm/v1/engine/processor.py | 20 ++++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index cac7f72a72d3..0a01cb0260ae 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -44,6 +44,8 @@ class StructuredOutputsParams: _backend: Optional[str] = field(default=None, init=False) """CAUTION: Should only be set by Processor._validate_structured_output""" + _backend_was_auto: bool = field(default=False, init=False) + """CAUTION: Should only be set by Processor._validate_structured_output""" def __post_init__(self): """Validate that some fields are mutually exclusive.""" diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 717a5ba64d37..4766b3039f7d 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -216,10 +216,20 @@ def _validate_structured_output(self, params: SamplingParams) -> None: ) backend = self.structured_outputs_config.backend - if params.structured_outputs._backend and backend != "auto": - raise ValueError( - "StructuredOutputsParams._backend should only be set here if " - "StructuredOutputsConfig.backend is 'auto'.") + if _backend := params.structured_outputs._backend: + # Request-level backend selection is not supported. + # The values may differ if `params` is reused and was set + # to a specific backend based on `auto` behavior in a previous + # request. We remember that it was set as a result of `auto` + # using the `_auto` option set on the backend in the params. + if (backend != _backend + and not (backend == "auto" + and params.structured_outputs._backend_was_auto)): + raise ValueError( + "Request-level structured output backend selection is not " + f"supported. The request specified '{_backend}', but vLLM " + f"was initialised with '{backend}'. This error can be " + "resolved by removing '_backend' from the request.") else: params.structured_outputs._backend = backend @@ -262,6 +272,8 @@ def _validate_structured_output(self, params: SamplingParams) -> None: # are not supported in xgrammar. Fall back to guidance. 
validate_guidance_grammar(params, tokenizer=None) params.structured_outputs._backend = "guidance" + # Remember that this backend was set automatically + params.structured_outputs._backend_was_auto = True def _maybe_build_mm_uuids( self, From 76cb011f1446646e77e90ea5283d9e24099cebda Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Sep 2025 00:51:50 +0200 Subject: [PATCH 40/43] Fix comment Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/v1/engine/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 4766b3039f7d..9c25e043c2d1 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -221,7 +221,7 @@ def _validate_structured_output(self, params: SamplingParams) -> None: # The values may differ if `params` is reused and was set # to a specific backend based on `auto` behavior in a previous # request. We remember that it was set as a result of `auto` - # using the `_auto` option set on the backend in the params. + # using the `_backend_was_auto` field set in the params. if (backend != _backend and not (backend == "auto" and params.structured_outputs._backend_was_auto)): From f869f9caa8ded7761fb6208dd7ffa4f817588ae3 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Sep 2025 13:39:12 +0200 Subject: [PATCH 41/43] Make failing test less flaky Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/entrypoints/openai/test_chat.py | 60 +++++++++++---------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index cbd3731096f0..a827f94cfbfe 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -670,10 +670,23 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, }, { "role": "user", - "content": - f"Give an example JSON for an employee profile that " - f"fits this schema: {sample_json_schema}" + "content": ("Give an example JSON for an employee " + "profile using the specified tool.") + }] + tools = [{ + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": sample_json_schema + } }] + tool_choice = { + "type": "function", + "function": { + "name": "dummy_function_name" + } + } # non-streaming @@ -681,20 +694,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, model=MODEL_NAME, messages=messages, max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice={ - "type": "function", - "function": { - "name": "dummy_function_name" - } - }, + tools=tools, + tool_choice=tool_choice, ) message = chat_completion.choices[0].message assert len(message.content) == 0 @@ -712,25 +713,12 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, # streaming - stream = await client.chat.completions.create( - model=MODEL_NAME, - messages=messages, - max_completion_tokens=1000, - tools=[{ - "type": "function", - "function": { - "name": "dummy_function_name", - "description": "This is a dummy function", - "parameters": sample_json_schema - } - }], - tool_choice={ - "type": "function", - "function": { - "name": 
"dummy_function_name" - } - }, - stream=True) + stream = await client.chat.completions.create(model=MODEL_NAME, + messages=messages, + max_completion_tokens=1000, + tools=tools, + tool_choice=tool_choice, + stream=True) output = [] finish_reason_count = 0 From ec94b4a5099aa97ff3afa84170cd9f5ae7e5b804 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 17 Sep 2025 14:50:34 +0200 Subject: [PATCH 42/43] Fix structured output being enabled by response format and tool calling Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index d0506a1040fe..9c5e5766db02 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -640,23 +640,32 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - if self.structured_outputs is not None: - if self.response_format is not None: - if self.response_format.type == "json_object": + if ((response_format := self.response_format) is not None or + (tool_json := self._get_json_schema_from_tool()) is not None): + # If structured outputs wasn't already enabled, + # we must enable it for these features to work + if self.structured_outputs is None: + self.structured_outputs = StructuredOutputsParams() + + # Set structured output params for response format + if response_format is not None: + if response_format.type == "json_object": self.structured_outputs.json_object = True - elif self.response_format.type == "json_schema": - json_schema = self.response_format.json_schema + elif response_format.type == "json_schema": + json_schema = response_format.json_schema assert json_schema is not None self.structured_outputs.json = json_schema.json_schema - elif self.response_format.type == "structural_tag": - structural_tag = self.response_format + elif response_format.type == "structural_tag": + structural_tag = response_format assert structural_tag is not None and isinstance( structural_tag, StructuralTagResponseFormat) s_tag_obj = structural_tag.model_dump(by_alias=True) self.structured_outputs.structural_tag = json.dumps( s_tag_obj) - if structured_outputs_json := self._get_json_schema_from_tool(): - self.structured_outputs.json = structured_outputs_json + + # Set structured output params for tool calling + if tool_json is not None: + self.structured_outputs.json = tool_json extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: From 5872fe7e392eed1491073e0c569d61f54785e8a5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 18 Sep 2025 08:45:20 +0200 Subject: [PATCH 43/43] Fix test Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/entrypoints/openai/protocol.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 2bea57dd653e..cff4a45fdc43 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -669,8 +669,9 @@ def to_sampling_params( if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - if ((response_format := self.response_format) is not None or - (tool_json := self._get_json_schema_from_tool()) is not None): + response_format = 
self.response_format + json_schema_from_tool = self._get_json_schema_from_tool() + if response_format is not None or json_schema_from_tool is not None: # If structured outputs wasn't already enabled, # we must enable it for these features to work if self.structured_outputs is None: @@ -693,8 +694,8 @@ def to_sampling_params( s_tag_obj) # Set structured output params for tool calling - if tool_json is not None: - self.structured_outputs.json = tool_json + if json_schema_from_tool is not None: + self.structured_outputs.json = json_schema_from_tool extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params:
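# --- Editor's illustrative sketch (not part of the patch) ---
# With the protocol.py change above, a request that only sets
# `response_format` (or a named `tool_choice`) now has structured outputs
# enabled automatically in to_sampling_params; the client does not need to
# pass any structured-output parameters explicitly. The server URL and model
# name below are assumptions for illustration only.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    messages=[{"role": "user", "content": "Reply with a JSON object."}],
    # json_object mode alone is enough to trigger structured decoding now.
    response_format={"type": "json_object"},
)
print(completion.choices[0].message.content)
# --- end sketch ---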