From 901d7054983d66f77a18844e70e5a997d92caaf8 Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Fri, 28 Mar 2025 20:21:29 +0000
Subject: [PATCH 1/2] [V1] Set structured output backend to `auto` by default

The previous default was `xgrammar`, and users had to opt in to fallback
behavior. After more thought, `auto` seems like a better default as it
lets us do our best to satisfy all requests. Users can still pin vLLM to
a single backend if desired.

Make `auto` work for V0 as well, in case it gets specified there.

Signed-off-by: Russell Bryant
---
 vllm/config.py                                  | 4 ++--
 vllm/engine/arg_utils.py                        | 6 +++---
 vllm/model_executor/guided_decoding/__init__.py | 6 ++++++
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 2662c6a84990..558cc095952d 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2888,7 +2888,7 @@ class DecodingConfig:
 
     # Which guided decoding algo to use.
     # 'outlines' / 'lm-format-enforcer' / 'xgrammar'
-    guided_decoding_backend: str = 'xgrammar'
+    guided_decoding_backend: str = "auto" if envs.VLLM_USE_V1 else "xgrammar"
 
     reasoning_backend: Optional[str] = None
 
@@ -2913,7 +2913,7 @@ def compute_hash(self) -> str:
 
     def __post_init__(self):
         v0_valid_guided_backends = [
-            'outlines', 'lm-format-enforcer', 'xgrammar'
+            'outlines', 'lm-format-enforcer', 'xgrammar', 'auto'
         ]
         v1_valid_guided_backends = ['xgrammar', 'guidance', 'auto']
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 0c81e3edbe33..3699c711148f 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -181,7 +181,7 @@ class EngineArgs:
     enable_chunked_prefill: Optional[bool] = None
     disable_chunked_mm_input: bool = False
 
-    guided_decoding_backend: str = 'xgrammar'
+    guided_decoding_backend: str = DecodingConfig.guided_decoding_backend
     logits_processor_pattern: Optional[str] = None
 
     speculative_config: Optional[Dict[str, Any]] = None
@@ -381,13 +381,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument(
             '--guided-decoding-backend',
             type=str,
-            default='xgrammar',
+            default=DecodingConfig.guided_decoding_backend,
             help='Which engine will be used for guided decoding'
             ' (JSON schema / regex etc) by default. Currently support '
             'https://github.com/mlc-ai/xgrammar and '
             'https://github.com/guidance-ai/llguidance.'
             'Valid backend values are "xgrammar", "guidance", and "auto". '
-            'With "auto", we will make opinionated choices based on request'
+            'With "auto", we will make opinionated choices based on request '
             'contents and what the backend libraries currently support, so '
             'the behavior is subject to change in each release.')
         parser.add_argument(
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index d4fd11fd2e30..6f0eede74b5a 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -33,6 +33,12 @@ def fallback_or_error(guided_params: GuidedDecodingParams, message: str,
         logger.warning("%s Falling back to use %s instead.", message, fallback)
         guided_params.backend = fallback
 
+    # `auto` was added for V1 to explicitly declare a mode that has fallbacks
+    # in place. If it is specified with V0, treat it as `xgrammar`, since we
+    # have fallbacks enabled for that backend and it is the V0 default.
+    if guided_params.backend == "auto":
+        guided_params.backend = "xgrammar"
+
     # lm-format-enforce doesn't support grammar, fallback to xgrammar
     if guided_params.backend_name == "lm-format-enforcer":
         if guided_params.grammar is not None:
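As a quick illustration of what "pinning" a backend looks like after this
change, here is a minimal sketch (not part of the series) using the offline
`LLM` entrypoint, which forwards `guided_decoding_backend` to the
`EngineArgs` field updated above. The model name is the one used in the
tests below, and the `GuidedDecodingParams` request shape is assumed from
vllm.sampling_params:

    from vllm import LLM, SamplingParams
    from vllm.sampling_params import GuidedDecodingParams

    # Pin a single backend; omit guided_decoding_backend to get the new
    # "auto" default on V1 (or "xgrammar" on V0).
    llm = LLM(model="HuggingFaceH4/zephyr-7b-beta",
              guided_decoding_backend="xgrammar")

    # Constrain generation to one of two strings via guided choice.
    params = SamplingParams(
        guided_decoding=GuidedDecodingParams(choice=["left", "right"]))
    outputs = llm.generate("Should we turn left or right? Answer: ", params)
    print(outputs[0].outputs[0].text)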
From f3b3cee8bf8b099c1c7a3a93edc887caa4f714e1 Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Thu, 10 Apr 2025 10:15:47 -0400
Subject: [PATCH 2/2] Fix tests to be more V1-friendly

These tests now run against V1 in CI, but the code was originally written
with V0 in mind. In particular, specifying the backend through the OpenAI
API is not supported with V1, so we can remove it and speed up the tests
quite a bit. Different backends are tested elsewhere (via the llm
entrypoint tests).

Signed-off-by: Russell Bryant
---
 tests/entrypoints/openai/test_chat.py | 74 ++++-----------------------
 1 file changed, 11 insertions(+), 63 deletions(-)

diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index b83c37a9032d..a10b42ea3a4b 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -20,8 +20,6 @@
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
 
-GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
-
 
 @pytest.fixture(scope="module")
 def monkeypatch_module():
@@ -487,20 +485,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
     assert last_completion_tokens == 10
 
 
-# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
-# (i.e. using the same ordering as in the Completions API tests), the test
-# will fail on the second `guided_decoding_backend` even when I swap their order
-# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
-                                  is_v1_server: bool,
-                                  guided_decoding_backend: str,
                                   sample_guided_choice):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -515,8 +502,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
 
     choice1 = chat_completion.choices[0].message.content
     assert choice1 in sample_guided_choice
@@ -530,22 +516,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
         messages=messages,
         max_completion_tokens=10,
         temperature=0.7,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
     choice2 = chat_completion.choices[0].message.content
     assert choice2 in sample_guided_choice
     assert choice1 != choice2
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
-                                guided_decoding_backend: str,
+async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                 sample_json_schema):
 
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported in V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -560,8 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json1 = json.loads(message.content)
@@ -578,8 +557,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=1000,
-        extra_body=dict(guided_json=sample_json_schema,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_json=sample_json_schema))
     message = chat_completion.choices[0].message
     assert message.content is not None
     json2 = json.loads(message.content)
@@ -589,13 +567,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_regex_chat(client: openai.AsyncOpenAI,
-                                 is_v1_server: bool,
-                                 guided_decoding_backend: str, sample_regex):
-
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
+async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex):
 
     messages = [{
         "role": "system",
@@ -610,8 +582,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
     ip1 = chat_completion.choices[0].message.content
     assert ip1 is not None
     assert re.fullmatch(sample_regex, ip1) is not None
@@ -622,8 +593,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
         model=MODEL_NAME,
         messages=messages,
         max_completion_tokens=20,
-        extra_body=dict(guided_regex=sample_regex,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_regex=sample_regex))
     ip2 = chat_completion.choices[0].message.content
     assert ip2 is not None
     assert re.fullmatch(sample_regex, ip2) is not None
@@ -652,15 +622,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
-                                           is_v1_server: bool,
-                                           guided_decoding_backend: str,
                                            sample_guided_choice):
 
-    if is_v1_server and guided_decoding_backend != 'xgrammar':
-        pytest.skip("Only xgrammar backend is supported with V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -676,8 +640,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
         max_completion_tokens=10,
         logprobs=True,
         top_logprobs=5,
-        extra_body=dict(guided_choice=sample_guided_choice,
-                        guided_decoding_backend=guided_decoding_backend))
+        extra_body=dict(guided_choice=sample_guided_choice))
 
     assert chat_completion.choices[0].logprobs is not None
     assert chat_completion.choices[0].logprobs.content is not None
@@ -689,14 +652,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
-                              guided_decoding_backend: str,
-                              sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
+async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -728,7 +684,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                 "name": "dummy_function_name"
             }
         },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend))
+    )
     message = chat_completion.choices[0].message
     assert len(message.content) == 0
     json_string = message.tool_calls[0].function.arguments
@@ -763,7 +719,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                 "name": "dummy_function_name"
             }
         },
-        extra_body=dict(guided_decoding_backend=guided_decoding_backend),
         stream=True)
 
     output = []
@@ -888,7 +843,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
         model=model_name,
         tools=tools,
         tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
     )
 
     assert chat_completion.choices[0].message.tool_calls is not None
@@ -900,7 +854,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
         model=model_name,
         tools=tools,
         tool_choice="required",
-        extra_body=dict(guided_decoding_backend="outlines"),
         stream=True,
     )
 
@@ -914,12 +867,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
-                                                  is_v1_server: bool,
                                                   sample_json_schema):
-
-    if is_v1_server:
-        pytest.skip("sample_json_schema has features unsupported on V1")
-
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
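For reference, here is a minimal sketch (not part of the series) of the
request style the updated tests settle on: guided decoding options ride
along in extra_body, with no per-request backend selection under V1. The
server URL is illustrative and the model name matches MODEL_NAME above:

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    # Guided-choice request; the backend is whatever the server was
    # launched with (the new default is "auto").
    chat = client.chat.completions.create(
        model="HuggingFaceH4/zephyr-7b-beta",
        messages=[{"role": "user", "content": "Turn left or right?"}],
        extra_body=dict(guided_choice=["left", "right"]))
    print(chat.choices[0].message.content)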