From 7fe0d40de4fc61810afbf7e16da988c33a8a2bd6 Mon Sep 17 00:00:00 2001
From: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
Date: Tue, 12 Aug 2025 17:13:43 +0000
Subject: [PATCH 1/5] [https://nvbugs/5375594][fix] fix oom issue on
structural_tag test case
Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
---
tests/integration/test_lists/waives.txt | 1 -
.../unittest/llmapi/apps/_test_openai_chat_structural_tag.py | 5 ++++-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 2e0037bcfc5..3597a6baa3e 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -43,7 +43,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bf
examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4866931)
examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https://nvbugs/4961624)
examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-chunked_summarization_long] SKIP (https://nvbugs/5321371)
-test_e2e.py::test_openai_chat_structural_tag_example SKIP (https://nvbugspro.nvidia.com/bug/5375594)
cpp/test_e2e.py::test_model[fp8-chatglm-90] SKIP (https://nvbugs/5034830)
full:B200_PCIe/unittest/trt/functional SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization SKIP (Disable for Blackwell)
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
index e3411404947..e1bf48f31c6 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -23,7 +23,10 @@ def temp_extra_llm_api_options_file(request):
temp_dir = tempfile.gettempdir()
temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
try:
- extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"}
+ extra_llm_api_options_dict = {
+ "guided_decoding_backend": "xgrammar",
+ "max_batch_size": 32
+ }
with open(temp_file_path, 'w') as f:
yaml.dump(extra_llm_api_options_dict, f)
From 0e2a5c13efe0966bcb7483765ff79e03cab93872 Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Wed, 13 Aug 2025 02:07:07 +0000
Subject: [PATCH 2/5] fix test
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
.../apps/_test_openai_chat_structural_tag.py | 56 +++++++++++--------
1 file changed, 33 insertions(+), 23 deletions(-)
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
index e1bf48f31c6..c35aa182f00 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -1,32 +1,32 @@
# Adapted from
# https://github.com/vllm-project/vllm/blob/aae6927be06dedbda39c6b0c30f6aa3242b84388/tests/entrypoints/openai/test_chat.py
+import json
import os
+import re
import tempfile
+import jsonschema
import openai
import pytest
import yaml
-from ..test_llm import get_model_path, similar
+from ..test_llm import get_model_path
from .openai_server import RemoteOpenAIServer
pytestmark = pytest.mark.threadleak(enabled=False)
-@pytest.fixture(scope="module", ids=["TinyLlama-1.1B-Chat"])
+@pytest.fixture(scope="module")
def model_name():
return "llama-3.1-model/Llama-3.1-8B-Instruct"
@pytest.fixture(scope="module")
-def temp_extra_llm_api_options_file(request):
+def temp_extra_llm_api_options_file():
temp_dir = tempfile.gettempdir()
temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
try:
- extra_llm_api_options_dict = {
- "guided_decoding_backend": "xgrammar",
- "max_batch_size": 32
- }
+ extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"}
with open(temp_file_path, 'w') as f:
yaml.dump(extra_llm_api_options_dict, f)
@@ -40,7 +40,11 @@ def temp_extra_llm_api_options_file(request):
@pytest.fixture(scope="module")
def server(model_name: str, temp_extra_llm_api_options_file: str):
model_path = get_model_path(model_name)
- args = ["--extra_llm_api_options", temp_extra_llm_api_options_file]
+
+ args = [
+ "--max_batch_size=32",
+ f"--extra_llm_api_options={temp_extra_llm_api_options_file}"
+ ]
with RemoteOpenAIServer(model_path, args) as remote_server:
yield remote_server
@@ -115,12 +119,7 @@ def tool_get_current_date():
def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
tool_get_current_weather, tool_get_current_date):
- messages = [
- {
- "role":
- "system",
- "content":
- f"""
+ system_prompt = f"""
# Tool Instructions
- Always execute python code in messages that you share.
- When looking for real time information use relevant functions if available else fallback to brave_search
@@ -143,13 +142,17 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
- Only call one function at a time
- Put the entire function call reply on one line
- Always add your sources when using search results to answer the user query
-You are a helpful assistant.""",
+You are a helpful assistant."""
+ user_prompt = "You are in New York. Please get the current date and time, and the weather."
+
+ messages = [
+ {
+ "role": "system",
+ "content": system_prompt,
},
{
- "role":
- "user",
- "content":
- "You are in New York. Please get the current date and time, and the weather.",
+ "role": "user",
+ "content": user_prompt,
},
]
@@ -176,11 +179,18 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
                    "triggers": ["<function="],
+
+    match = re.search(r'<function=get_current_weather>([\S\s]+?)</function>',
+                      message.content)
+ params = json.loads(match.group(1))
+ jsonschema.validate(params,
+ tool_get_current_weather["function"]["parameters"])
+
+    match = re.search(r'<function=get_current_date>([\S\s]+?)</function>',
+                      message.content)
+ params = json.loads(match.group(1))
+ jsonschema.validate(params, tool_get_current_date["function"]["parameters"])
From 671ac9bd3442ac82027742e0317228a8db10aba2 Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Wed, 13 Aug 2025 02:17:03 +0000
Subject: [PATCH 3/5] add req
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
requirements-dev.txt | 1 +
1 file changed, 1 insertion(+)
diff --git a/requirements-dev.txt b/requirements-dev.txt
index c8293761eaa..986026e5303 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -24,6 +24,7 @@ cloudpickle
typing-extensions==4.12.2
bandit==1.7.7
jsonlines==4.0.0
+jsonschema
jieba==0.42.1
rouge==1.0.1
pytest-rerunfailures
From c597d603dc78e5e51a53cf6a06cb8db0b8dab9fb Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Wed, 13 Aug 2025 06:05:05 +0000
Subject: [PATCH 4/5] fix oom
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
.../unittest/llmapi/apps/_test_openai_chat_structural_tag.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
index c35aa182f00..022b5a89863 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -41,8 +41,9 @@ def temp_extra_llm_api_options_file():
def server(model_name: str, temp_extra_llm_api_options_file: str):
model_path = get_model_path(model_name)
+ # Use small max_batch_size/max_seq_len/max_num_tokens to avoid OOM on A10/A30 GPUs.
args = [
- "--max_batch_size=32",
+ "--max_batch_size=8", "--max_seq_len=1024", "--max_num_tokens=1024",
f"--extra_llm_api_options={temp_extra_llm_api_options_file}"
]
with RemoteOpenAIServer(model_path, args) as remote_server:
@@ -159,7 +160,7 @@ def test_chat_structural_tag(client: openai.OpenAI, model_name: str,
chat_completion = client.chat.completions.create(
model=model_name,
messages=messages,
- max_completion_tokens=100,
+ max_completion_tokens=256,
response_format={
"type":
"structural_tag",
From 64123876dace0aabe2ef681baa630cf0c39294db Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Wed, 13 Aug 2025 07:34:52 +0000
Subject: [PATCH 5/5] clean
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
requirements-dev.txt | 1 -
tests/unittest/llmapi/apps/_test_openai_chat_json.py | 6 +-----
2 files changed, 1 insertion(+), 6 deletions(-)
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 986026e5303..c8293761eaa 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -24,7 +24,6 @@ cloudpickle
typing-extensions==4.12.2
bandit==1.7.7
jsonlines==4.0.0
-jsonschema
jieba==0.42.1
rouge==1.0.1
pytest-rerunfailures
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_json.py b/tests/unittest/llmapi/apps/_test_openai_chat_json.py
index a444b5566b8..53651828507 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_json.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_json.py
@@ -26,11 +26,7 @@ def temp_extra_llm_api_options_file(request):
temp_dir = tempfile.gettempdir()
temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
try:
- extra_llm_api_options_dict = {
- "guided_decoding_backend": "xgrammar",
- "disable_overlap_scheduler":
- True, # Guided decoding is not supported with overlap scheduler
- }
+ extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"}
with open(temp_file_path, "w") as f:
yaml.dump(extra_llm_api_options_dict, f)